Foppe Hemminga před 6 roky
revize
43c4313679
6 změnil soubory, kde provedl 307 přidání a 0 odebrání
  1. 95 0
      _bs.py
  2. 17 0
      _html.py
  3. 14 0
      database.py
  4. 48 0
      main.py
  5. 123 0
      model.py
  6. 10 0
      requirements.txt

+ 95 - 0
_bs.py

@@ -0,0 +1,95 @@
+from bs4 import BeautifulSoup
+import re
+# import datetime
+# from pytz import timezone
+import model
+import collections
+# # import pytz
+# from pprint import pprint
+
+
+"""
+This module contains custom methods based on bs4.beautifulsoup to analyze data
+"""
+
+# Root of the FreeFields section of the site; page names are appended to it
+base_url = 'https://racingaustralia.horse/FreeFields/'
+
+# A venue is identified by its state abbreviation and its name
+Venue = collections.namedtuple('Venue', ['state', 'name'])
+
+# A venue plus the day it races on and the address of that day's program
+RaceDayShort = collections.namedtuple(
+    'RaceDayShort',
+    [*Venue._fields, 'date_string', 'date', 'program_url'],
+)
+
+# A RaceDayShort extended with the scratchings timestamps
+# noinspection PyProtectedMember,PyUnresolvedReferences
+RaceDay = collections.namedtuple(
+    'RaceDay',
+    [
+        *RaceDayShort._fields,
+        'scratchings_latest_datetime',
+        'scratchings_latest_unixtime',
+        'scratchings_close_datetime',
+        'scratchings_close_unixtime',
+    ],
+)
+
+# One scratched horse in one race
+Scratching = collections.namedtuple(
+    'Scratching', ['venue', 'state', 'date', 'race', 'horse'])
+
+
+def get_today_row(this_text, this_row):
+    """
+    Traverses the main table on the front page of https://racingaustralia.horse.
+    This function scrapes Venue information and race day information.
+
+    Each ``tr.rows`` row is handled as one race day: the first cell holds the
+    date text, every following cell holds the venue links of one state, in the
+    column order NSW, VIC, QLD, WA, SA, TAS, ACT, NT. Cells beyond those eight
+    state columns are ignored.
+    :param this_text: HTML of the front page
+    :param this_row: index of the single row to scrape, or -1 to scrape all rows
+    :return: list of RaceDayShort namedtuples, one per venue link that has an href
+    :raises IndexError: if this_row is not a valid row index
+    """
+    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
+    this_soup = BeautifulSoup(this_text, 'html.parser')
+    rows = this_soup.select('tr.rows')
+    row_indexes = range(len(rows)) if this_row == -1 else [this_row]
+    all_race_days = []
+    for row_index in row_indexes:
+        cells = rows[row_index].select('td')
+        if not cells:
+            continue
+        # First cell contains date information
+        date_string = cells[0].find('span').getText()
+        # Pairing cells with states avoids a hand-kept counter and cannot run
+        # past the end of the states tuple
+        for state, cell in zip(states, cells[1:]):
+            paragraph = cell.find('p')
+            if paragraph is None or not paragraph.getText().strip():
+                # Cell is empty
+                continue
+            # There may be several links in a cell (which represents a state)
+            for a in cell.findAll('a'):
+                this_url = a.get('href')
+                if not this_url:
+                    continue
+                this_venue = Venue(state, a.getText().strip())
+                # Create the race program URL by substitution
+                program_url = base_url + re.sub(r"/(.*)\.aspx", 'RaceProgram.aspx', this_url)
+                calculated_date = model.convert_to_date(date_string)
+                all_race_days.append(RaceDayShort(
+                    this_venue.state, this_venue.name, date_string,
+                    calculated_date, program_url))
+    return all_race_days
+
+
+def separate_races(program_html):
+    """
+    Extract the race number and start time of every race on a program page.
+
+    Only ``a.race-title-anchor-3`` anchors inside ``table.race-title`` tables
+    are inspected; anchors whose text does not begin with
+    ``Race <number> - <h:mm>AM|PM `` are ignored.
+    :param program_html: HTML of a race program page
+    :return: list of (race number, start time) tuples, both as strings
+    """
+    soup = BeautifulSoup(program_html, 'html.parser')
+    title_pattern = re.compile(r'^Race (\d+) - (\d{1,2}:\d{2}[AP]M) ')
+    found = []
+    for block in soup.select('table.race-title'):
+        for anchor in block.select('a.race-title-anchor-3'):
+            hit = title_pattern.search(anchor.getText())
+            if hit is None:
+                continue
+            found.append(hit.groups())
+    return found

+ 17 - 0
_html.py

@@ -0,0 +1,17 @@
+import requests
+
+
+"""
+This module contains methods to retrieve pages
+"""
+
+
+def get_page(this_url, timeout=30):
+    """
+    Simple utility gets the contents of a webpage
+    :param this_url: address of the page to download
+    :param timeout: seconds to wait for the server before giving up
+    :return: body of the response decoded as text
+    :raises requests.exceptions.HTTPError: on a 4xx or 5xx response
+    :raises requests.exceptions.RequestException: on connection problems and timeouts
+    """
+    # Without a timeout requests waits forever on a server that never answers
+    response = requests.get(this_url, timeout=timeout)
+    # Fail loudly instead of handing an error page to the parsers
+    response.raise_for_status()
+    return response.text

+ 14 - 0
database.py

@@ -0,0 +1,14 @@
+import dotenv
+import os
+import psycopg2
+
+
+# Develop only: read settings from a local .env file into the environment
+dotenv.load_dotenv()
+
+# Connection settings; a missing variable raises KeyError while importing
+host, port, database, user, password = (
+    os.environ[key]
+    for key in (
+        "STOCK_DB_HOST",
+        "STOCK_DB_PORT",
+        "STOCK_DB_DATABASE",
+        "STOCK_DB_USER",
+        "STOCK_DB_PASSWD",
+    )
+)
+
+# Connection opened once, when this module is first imported
+db = psycopg2.connect(
+    host=host, port=port, database=database, user=user, password=password)

+ 48 - 0
main.py

@@ -0,0 +1,48 @@
+# import psycopg2.extras
+# import datetime
+
+import model
+from pprint import pprint
+import time
+# pwd
+import database
+import sys
+# import view
+
+def main(argv=None):
+    """
+    Scrape the race programs listed on the front page and upsert them
+    into the race_program table.
+
+    :param argv: command line arguments without the program name; defaults to
+                 sys.argv[1:]. The first argument is the index of the front
+                 page row to scrape (absent or -1 means every row).
+    :return: None
+    """
+    if argv is None:
+        argv = sys.argv[1:]
+    row = -1
+    if argv:
+        try:
+            row = int(argv[0])
+        except ValueError:
+            # A string argument makes sys.exit print it to stderr and exit with status 1
+            sys.exit('row must be an integer, got {!r}'.format(argv[0]))
+    # NOTE(review): a second argument switches broadcast off, but nothing in
+    # this script reads the flag yet
+    broadcast = len(argv) <= 1
+
+    # On a conflict the key columns are equal by definition, so the update has
+    # to refresh the remaining columns or changed start times are never stored
+    query = """
+        INSERT INTO race_program (
+            race_date, venue, race, start_time, utctime, state)
+        VALUES (%s, %s, %s, %s, %s, %s)
+        ON CONFLICT (race_date, venue, race) DO UPDATE SET
+            start_time = excluded.start_time,
+            utctime = excluded.utctime,
+            state = excluded.state
+    """
+
+    db = database.db
+    cursor = db.cursor()
+    try:
+        race_days_global = model.scrape_main_page(row)
+        pprint(race_days_global)
+        # model.setup_database()
+        for venue in race_days_global:
+            raw_data = model.get_program_data(venue.program_url)
+            for race, start_time in raw_data:
+                timestamp = model.convert_to_tz_aware_datetime(venue.date, start_time, venue.state)
+                cursor.execute(query, (venue.date, venue.name, race, start_time, timestamp, venue.state))
+        model.create_json(db)
+        db.commit()
+    finally:
+        # Release the cursor and the connection even when scraping fails
+        cursor.close()
+        db.close()
+
+
+if __name__ == '__main__':
+    main()

+ 123 - 0
model.py

@@ -0,0 +1,123 @@
+import datetime
+import re
+
+import pytz
+
+import _html
+import _bs
+import database
+import psycopg2.extras
+
+from pprint import pprint
+
+
+# Time zone name handed to pytz for each state or territory abbreviation
+local_timezones = {
+    "NSW": "Australia/Sydney",
+    "VIC": "Australia/Melbourne",
+    "QLD": "Australia/Brisbane",
+    "WA": "Australia/Perth",
+    "SA": "Australia/Adelaide",
+    "TAS": "Australia/Hobart",
+    "ACT": "Australia/Sydney",
+    "NT": "Australia/Darwin",
+}
+
+
+def scrape_main_page(row=-1):
+    """
+    Download the Racing Australia home page and extract the race days from it.
+    :param row: index of the table row to scrape; -1 (the default) scrapes every row
+    :return: list of RaceDayShort namedtuples
+    """
+    home_page = _html.get_page('https://racingaustralia.horse/Home.aspx')
+    return _bs.get_today_row(home_page, row)
+
+
+def convert_to_date(weird_string, year=None):
+    """
+    Converts a string like 'MONDAY 15 JUL' to a python date object
+    :param weird_string: weekday name, day of month and abbreviated month name
+    :param year: year to attach to the date; defaults to the current year in Sydney
+    :return: datetime.date
+    :raises ValueError: if weird_string does not have the expected layout
+    """
+    # The replacement has to be a raw string: in a plain literal '\1' is the
+    # control character chr(1), not a reference to the captured digit, which
+    # made every single-digit day unparsable.
+    weird_string = re.sub(r' (\d) ', r' 0\1 ', weird_string)
+    if year is None:
+        local_timezone = pytz.timezone('Australia/Sydney')
+        year = datetime.datetime.now(local_timezone).year
+    # NOTE(review): the page text carries no year, so a January date read in
+    # late December gets the old year unless the caller passes year explicitly.
+    # NOTE(review): %A and %b are matched using the process locale's names.
+    return datetime.datetime.strptime(
+        '{} {}'.format(year, weird_string), "%Y %A %d %b").date()
+
+
+def convert_to_tz_aware_datetime(date, time, state):
+    """
+    Creates a datetime object to be stored as timestamptz in PostgreSQL
+    :param date: day of the race; its year, month and day attributes are used
+    :param time: 12-hour clock time such as '1:05PM'
+    :param state: key of local_timezones, e.g. 'NSW'
+    :return: timezone aware datetime in the time zone of the state; midnight
+             of that day when time does not have the expected layout
+    :raises KeyError: if state is not a key of local_timezones
+    """
+    tz = pytz.timezone(local_timezones[state])
+    hour = 0
+    minute = 0
+    time_match = re.match(r'^(\d{1,2}):(\d{2})([AP])M$', time)
+    if time_match:
+        # 12 o'clock is hour 0 of its half day: 12:30AM -> 00:30, 12:30PM -> 12:30
+        hour = int(time_match.group(1)) % 12
+        minute = int(time_match.group(2))
+        if time_match.group(3) == 'P':
+            hour += 12
+    timestamp = datetime.datetime(date.year, date.month, date.day, hour, minute, 0, 0)
+    # localize attaches the zone's offset valid on that particular date
+    return tz.localize(timestamp)
+
+
+def get_program_data(this_url):
+    """
+    Download a race program page and list the races found on it.
+    The list is also pretty-printed to stdout.
+    :param this_url: address of the race program page
+    :return: the result of _bs.separate_races for the downloaded page
+    """
+    found_races = _bs.separate_races(_html.get_page(this_url))
+    pprint(found_races)
+    return found_races
+
+
+def create_json(this_db):
+    """
+    Creates a json file with today's race data
+
+    NOTE(review): unfinished. The body only prepares a query, the current
+    Sydney time and a cursor; the query is never executed and nothing is
+    written or returned.
+    :param this_db: database connection; its cursor() must accept cursor_factory
+    :return: None
+    """
+    # Expects one parameter, the race date; presumably today's date in Sydney - TODO confirm
+    query = """
+    SELECT venue, race, start_time, utctime, state
+    FROM race_program
+    WHERE race_date = %s;"""
+    local_timezone = pytz.timezone('Australia/Sydney')
+    now = datetime.datetime.now(local_timezone)
+    # The cursor is opened here but neither used nor closed in this function
+    cursor = this_db.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
+
+
+def setup_database():
+    """
+    Set up for the database table
+
+    Creates race_program if it does not exist yet and commits. The connection
+    database.db is shared by the whole program, so it is left open here;
+    closing it would break every later use of it by the caller.
+    :return: None
+    """
+    query = """
+    CREATE TABLE IF NOT EXISTS race_program (
+        id SERIAL,
+        race_date DATE NOT NULL,
+        venue TEXT NOT NULL,
+        state TEXT,
+        race INTEGER,
+        start_time TIME,
+        utctime TIMESTAMP WITH TIME ZONE,
+        UNIQUE (race_date, venue, race));
+    """
+    db = database.db
+    cursor = db.cursor()
+    try:
+        cursor.execute(query)
+        db.commit()
+    finally:
+        # Release the cursor even when the statement fails
+        cursor.close()

+ 10 - 0
requirements.txt

@@ -0,0 +1,10 @@
+beautifulsoup4==4.7.1
+certifi==2019.6.16
+chardet==3.0.4
+idna==2.8
+psycopg2==2.8.3
+python-dotenv==0.10.3
+pytz==2019.1
+requests==2.22.0
+soupsieve==1.9.2
+urllib3==1.25.3