před 6 roky · 43c4313679
--- a/_bs.py
+++ b/_bs.py
@@ -0,0 +1,95 @@
 
				+from bs4 import BeautifulSoup
			
 
				+import re
			
 
				+# import datetime
			
 
				+# from pytz import timezone
			
 
				+import model
			
 
				+import collections
			
 
				+# # import pytz
			
 
				+# from pprint import pprint
			
 
				+
			
 
				+
			
 
				+"""
			
 
				+This module contains custom methods based on bs4.beautifulsoup to analyze data
			
 
				+"""
			
 
				+
			
 
				# Prefix that is put in front of every generated program page address.
				base_url = 'https://racingaustralia.horse/FreeFields/'

				# A venue name together with the state it is listed under.
				Venue = collections.namedtuple('Venue', 'state, name')

				# Venue fields plus the date (raw text and parsed) and the program page URL.
				RaceDayShort = collections.namedtuple(
				    'RaceDayShort',
				    [*Venue._fields, 'date_string', 'date', 'program_url'])

				# RaceDayShort fields plus the scratchings deadlines.
				# noinspection PyProtectedMember,PyUnresolvedReferences
				RaceDay = collections.namedtuple(
				    'RaceDay',
				    [*RaceDayShort._fields,
				     'scratchings_latest_datetime', 'scratchings_latest_unixtime',
				     'scratchings_close_datetime', 'scratchings_close_unixtime'])

				# One scratched horse in one race.
				Scratching = collections.namedtuple(
				    'Scratching', 'venue state date race horse')
			
 
				+
			
 
				+
			
 
				def get_today_row(this_text, this_row):
				    """
				    Collect the race days listed in the main table of the front page of
				    https://racingaustralia.horse.

				    Every ``tr.rows`` row is read as one day: the first cell holds the date
				    text, each following cell holds the venue links of one state, in the
				    column order given by ``states`` below. One RaceDayShort is produced
				    per venue link that carries an href.

				    Rows without a date ``<span>``, cells without a non-empty ``<p>`` and
				    cells beyond the known state columns are skipped instead of raising.

				    :param this_text: HTML text of the front page
				    :param this_row: index of the single row to read, or -1 to read all rows
				    :return: list of RaceDayShort namedtuples
				    :raises IndexError: if this_row does not address an existing row
				    """
				    # Column order of the state cells that follow the date cell.
				    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
				    this_soup = BeautifulSoup(this_text, 'html.parser')
				    rows = this_soup.select('tr.rows')
				    rows_to_check = rows if this_row == -1 else [rows[this_row]]
				    all_race_days = []
				    for my_row in rows_to_check:
				        cells = my_row.select('td')
				        if not cells:
				            continue
				        # First cell contains date information
				        date_span = cells[0].find('span')
				        if date_span is None:
				            # Without a date no race day can be built from this row.
				            continue
				        date_string = date_span.getText()
				        # Parsed lazily, and only once per row, when the first link is found.
				        calculated_date = None
				        # zip() stops at the shorter input, so surplus cells are ignored.
				        for state, cell in zip(states, cells[1:]):
				            paragraph = cell.find('p')
				            if paragraph is None or not paragraph.getText().strip():
				                # Cell is empty
				                continue
				            # There may be several links in a cell (which represents a state)
				            for a in cell.findAll('a'):
				                this_url = a.get('href')
				                if not this_url:
				                    continue
				                if calculated_date is None:
				                    calculated_date = model.convert_to_date(date_string)
				                # Create the program URL by substitution
				                program_url = base_url + re.sub(
				                    r"/(.*)\.aspx", 'RaceProgram.aspx', this_url)
				                all_race_days.append(RaceDayShort(
				                    state, a.getText().strip(), date_string,
				                    calculated_date, program_url))
				    return all_race_days
			
 
				+
			
 
				+
			
 
				def separate_races(program_html):
				    """
				    Extract the race number and start time of each race from a program page.

				    The text of every ``a.race-title-anchor-3`` link inside a
				    ``table.race-title`` is matched against ``Race <n> - <h:mm>AM|PM ``;
				    links whose text does not start that way are ignored.

				    :param program_html: HTML text of the program page
				    :return: list of (race number, start time) tuples, both as strings
				    """
				    heading_pattern = r'^Race (\d+) - (\d{1,2}:\d{2}[AP]M) '
				    soup = BeautifulSoup(program_html, 'html.parser')
				    headings = (
				        anchor.getText()
				        for block in soup.select('table.race-title')
				        for anchor in block.select('a.race-title-anchor-3')
				    )
				    hits = (re.search(heading_pattern, heading) for heading in headings)
				    return [hit.group(1, 2) for hit in hits if hit]
			
--- a/_html.py
+++ b/_html.py
@@ -0,0 +1,17 @@
 
				+import requests
			
 
				+
			
 
				+
			
 
				+"""
			
 
				+This module contains methods to retrieve pages
			
 
				+"""
			
 
				+
			
 
				+
			
 
				def get_page(this_url, timeout=30):
				    """
				    Simple utility gets the contents of a webpage

				    A timeout is always sent with the request; without one a server that
				    stops responding would block the caller indefinitely.

				    @TODO Handle http errors. The status code is not inspected, so the body
				    of an error response is returned like any other page.
				    :param this_url: address of the page to download
				    :param timeout: seconds to wait for the server, passed to requests.get
				    :return: the response body as text
				    :raises requests.exceptions.Timeout: if the server does not answer in time
				    """
				    data = requests.get(this_url, timeout=timeout)
				    return data.text
			
--- a/database.py
+++ b/database.py
@@ -0,0 +1,14 @@
 
				+import dotenv
			
 
				+import os
			
 
				+import psycopg2
			
 
				+
			
 
				+
			
 
				# Develop only
				dotenv.load_dotenv()

				# All five settings are read with os.environ[...], so a missing variable
				# raises KeyError while this module is being imported.
				host, port, database, user, password = (
				    os.environ[setting]
				    for setting in (
				        "STOCK_DB_HOST",
				        "STOCK_DB_PORT",
				        "STOCK_DB_DATABASE",
				        "STOCK_DB_USER",
				        "STOCK_DB_PASSWD",
				    )
				)

				# Connection opened at import time and shared through this module attribute.
				db = psycopg2.connect(
				    host=host,
				    port=port,
				    database=database,
				    user=user,
				    password=password,
				)
			
--- a/main.py
+++ b/main.py
@@ -0,0 +1,48 @@
 
				+# import psycopg2.extras
			
 
				+# import datetime
			
 
				+
			
 
				+import model
			
 
				+from pprint import pprint
			
 
				+import time
			
 
				+# pwd
			
 
				+import database
			
 
				+import sys
			
 
				+# import view
			
 
				+
			
 
				if __name__ == '__main__':
				    # Optional first argument: index of the front page table row to scrape.
				    # -1 (the default) means every row.
				    row = -1
				    if len(sys.argv) > 1:
				        try:
				            row = int(sys.argv[1])
				        except ValueError:
				            sys.stderr.write(
				                'row must be an integer, got {!r}\n'.format(sys.argv[1]))
				            sys.exit(1)
				    # Any second argument switches broadcasting off.
				    # NOTE(review): broadcast and start are not read anywhere in this script.
				    broadcast = len(sys.argv) <= 2
				    start = time.time()

				    # The statement is the same for every race, so it is built once.
				    # On a conflict the non-key columns are refreshed; assigning the key
				    # columns to themselves would leave a changed start time unsaved.
				    query = """
				        INSERT INTO race_program (
				            race_date, venue, race, start_time, utctime, state)
				        VALUES (%s, %s, %s, %s, %s, %s)
				        ON CONFLICT (race_date, venue, race) DO UPDATE SET
				            start_time = excluded.start_time,
				            utctime = excluded.utctime,
				            state = excluded.state
				    """

				    db = database.db
				    cursor = db.cursor()
				    try:
				        # Pass the requested row on; it was parsed above but never used.
				        race_days_global = model.scrape_main_page(row)
				        pprint(race_days_global)
				        # The race_program table must already exist at this point.
				        for venue in race_days_global:
				            raw_data = model.get_program_data(venue.program_url)
				            for race_number, start_time in raw_data:
				                timestamp = model.convert_to_tz_aware_datetime(
				                    venue.date, start_time, venue.state)
				                cursor.execute(query, (
				                    venue.date, venue.name, race_number, start_time,
				                    timestamp, venue.state))
				        model.create_json(db)

				        db.commit()
				    finally:
				        # Runs on errors too; closing without a commit discards the work.
				        cursor.close()
				        db.close()
			
--- a/model.py
+++ b/model.py
@@ -0,0 +1,123 @@
 
				+import datetime
			
 
				+import re
			
 
				+
			
 
				+import pytz
			
 
				+
			
 
				+import _html
			
 
				+import _bs
			
 
				+import database
			
 
				+import psycopg2.extras
			
 
				+
			
 
				+from pprint import pprint
			
 
				+
			
 
				+
			
 
				# Time zone name used for the start times of each state.
				local_timezones = dict(
				    NSW="Australia/Sydney",
				    VIC="Australia/Melbourne",
				    QLD="Australia/Brisbane",
				    WA="Australia/Perth",
				    SA="Australia/Adelaide",
				    TAS="Australia/Hobart",
				    ACT="Australia/Sydney",
				    NT="Australia/Darwin",
				)
			
 
				+
			
 
				+
			
 
				def scrape_main_page(row=-1):
				    """
				    Download the Racing Australia home page and hand it to
				    _bs.get_today_row for parsing.

				    :param row: passed through to _bs.get_today_row unchanged (default -1)
				    :return: whatever _bs.get_today_row returns for the page
				    """
				    front_page = _html.get_page('https://racingaustralia.horse/Home.aspx')
				    return _bs.get_today_row(front_page, row)
			
 
				+
			
 
				+
			
 
				def convert_to_date(weird_string, today=None):
				    """
				    Converts a string like 'MONDAY 15 JUL' to a python date object

				    The string carries no year, so the year is chosen from last, this and
				    next year such that the result lies closest to ``today``. This keeps
				    dates on the far side of New Year in the correct year.

				    Single digit days need no zero padding because ``%d`` accepts them.
				    The earlier padding step inserted a control character instead of the
				    digit and made every single digit day fail to parse.

				    :param weird_string: weekday name, day of month and month abbreviation
				    :param today: reference date; defaults to the current date in Sydney
				    :return: datetime.date object
				    :raises ValueError: if the string does not match the expected layout
				    """
				    if today is None:
				        today = datetime.datetime.now(pytz.timezone('Australia/Sydney')).date()
				    # Collapse runs of whitespace and drop leading/trailing blanks.
				    cleaned = ' '.join(weird_string.split())
				    candidates = []
				    for year in (today.year - 1, today.year, today.year + 1):
				        try:
				            parsed = datetime.datetime.strptime(
				                '{} {}'.format(year, cleaned), "%Y %A %d %b")
				        except ValueError:
				            # Bad layout, or 29 FEB in a year that is not a leap year.
				            continue
				        candidates.append(parsed.date())
				    if not candidates:
				        raise ValueError(
				            'cannot convert {!r} to a date'.format(weird_string))
				    return min(candidates, key=lambda candidate: abs(candidate - today))
			
 
				+
			
 
				+
			
 
				def convert_to_tz_aware_datetime(date, time, state):
				    """
				    Creates a datetime object to be stored as timestamptz in PostgreSQL

				    The 12-hour clock time is combined with the date and localized to the
				    time zone registered for the state in ``local_timezones``. A time that
				    does not look like ``H:MMAM`` / ``H:MMPM`` falls back to midnight.

				    :param date: object with year, month and day attributes
				    :param time: start time text such as '1:05PM'
				    :param state: key of ``local_timezones``, e.g. 'NSW'
				    :return: timezone aware datetime
				    :raises KeyError: if state is not a key of ``local_timezones``
				    """
				    tz = pytz.timezone(local_timezones[state])
				    hour = 0
				    minute = 0
				    time_match = re.match(r'^(\d{1,2}):(\d{2})([AP])M$', time)
				    if time_match:
				        hour = int(time_match.group(1))
				        minute = int(time_match.group(2))
				        if time_match.group(3) == 'P':
				            # 12PM stays 12, 1PM..11PM become 13..23.
				            hour = (hour % 12) + 12
				        elif hour == 12:
				            # 12AM is the first hour of the day, not noon.
				            hour = 0
				    timestamp = datetime.datetime(date.year, date.month, date.day, hour, minute, 0, 0)
				    return tz.localize(timestamp)
			
 
				+
			
 
				+
			
 
				def get_program_data(this_url):
				    """
				    Download a program page and extract its races.

				    The extracted list is also pretty-printed to standard output.

				    :param this_url: address of the program page
				    :return: the result of _bs.separate_races for the downloaded page
				    """
				    races = _bs.separate_races(_html.get_page(this_url))
				    pprint(races)
				    return races
			
 
				+
			
 
				+
			
 
				def create_json(this_db):
				    """
				    Creates a json file with today's race data (unfinished)

				    So far only the groundwork exists: the SELECT statement, the current
				    time in Sydney and a named tuple cursor. The statement is not executed
				    and nothing is serialised or written yet.

				    The cursor is now closed before returning; it used to be left open.

				    :param this_db: database connection offering cursor(cursor_factory=...)
				    :return: None
				    """
				    query = """
				    SELECT venue, race, start_time, utctime, state
				    FROM race_program
				    WHERE race_date = %s;"""
				    local_timezone = pytz.timezone('Australia/Sydney')
				    now = datetime.datetime.now(local_timezone)
				    cursor = this_db.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
				    try:
				        # TODO: execute `query` for the date taken from `now` and write the
				        # rows out as JSON; the target and layout are not defined yet.
				        return None
				    finally:
				        cursor.close()
			
 
				+
			
 
				+
			
 
				def setup_database():
				    """
				    Set up for the database table

				    Creates the race_program table if it does not exist yet and commits.

				    The connection is the shared ``database.db`` object, so it is left
				    open: closing it here would break every other user of that module
				    attribute within the same process. Only the cursor is closed, also
				    when the statement fails.

				    :return: None
				    """
				    query = """
				    CREATE TABLE IF NOT EXISTS race_program (
				        id SERIAL,
				        race_date DATE NOT NULL,
				        venue TEXT NOT NULL,
				        state TEXT,
				        race INTEGER,
				        start_time TIME,
				        utctime TIMESTAMP WITH TIME ZONE,
				        UNIQUE (race_date, venue, race));
				    """
				    db = database.db
				    cursor = db.cursor()
				    try:
				        cursor.execute(query)
				        db.commit()
				    finally:
				        cursor.close()
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,10 @@
 
				+beautifulsoup4==4.7.1
			
 
				+certifi==2019.6.16
			
 
				+chardet==3.0.4
			
 
				+idna==2.8
			
 
				+psycopg2==2.8.3
			
 
				+python-dotenv==0.10.3
			
 
				+pytz==2019.1
			
 
				+requests==2.22.0
			
 
				+soupsieve==1.9.2
			
 
				+urllib3==1.25.3