Foppe Hemminga 6 lat temu
rodzic
commit
8f8ac0c518
10 zmienionych plików z 248 dodań i 42 usunięć
  1. 11 0
      .idea/dataSources.xml
  2. 3 0
      .idea/dictionaries/foppe.xml
  3. 6 0
      .idea/inspectionProfiles/Project_Default.xml
  4. 94 33
      _bs.py
  5. 6 1
      _html.py
  6. 14 0
      database.py
  7. 64 4
      main.py
  8. 27 4
      model.py
  9. 2 0
      requirements.txt
  10. 21 0
      view.py

+ 11 - 0
.idea/dataSources.xml

@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="horses@relentless.rocks" uuid="589bba45-cb38-44b4-8f6a-0076913eac99">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://relentless.rocks:5432/horses</jdbc-url>
+    </data-source>
+  </component>
+</project>

+ 3 - 0
.idea/dictionaries/foppe.xml

@@ -4,8 +4,11 @@
       <w>aest</w>
       <w>aspx</w>
       <w>beautifulsoup</w>
+      <w>dotenv</w>
       <w>pytz</w>
+      <w>sratchings</w>
       <w>unixtime</w>
+      <w>webpage</w>
     </words>
   </dictionary>
 </component>

+ 6 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="DotEnvDuplicateKeyInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+  </profile>
+</component>

+ 94 - 33
_bs.py

@@ -5,6 +5,7 @@ from pytz import timezone
 import model
 import collections
 # import pytz
+from pprint import pprint
 
 
 """
@@ -13,11 +14,12 @@ This module contains custom methods based on bs4.beautifulsoup to analyze data
 
 base_url = 'https://racingaustralia.horse/FreeFields/'
 Venue = collections.namedtuple('Venue', 'state, name')
-RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
+RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
 # noinspection PyProtectedMember,PyUnresolvedReferences
 RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
     'scratchings_latest_datetime', 'scratchings_latest_unixtime',
     'scratchings_close_datetime', 'scratchings_close_unixtime'))
+Scratching = collections.namedtuple('Scratching', 'venue date race horse')
 
 
 def get_today_row(this_text):
@@ -30,37 +32,40 @@ def get_today_row(this_text):
     """
     this_soup = BeautifulSoup(this_text, 'html.parser')
     rows = this_soup.select('tr.rows')
-    my_row = rows[2]
-    cells = my_row.select('td')
-    i = 0
-    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
+    # print('len(rows) {}'.format(len(rows)))
     all_race_days = []
-    day = 'Unknown'
-    for cell in cells:
-        if i == 0:
-            # First cell contains date information
-            day = cell.find('span').getText()
-            # print("date: {}".format(day))
+    for day in range(len(rows)):
+        my_row = rows[day]
+        cells = my_row.select('td')
+        i = 0
+        states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
+        day = 'Unknown'
+        for cell in cells:
+            if i == 0:
+                # First cell contains date information
+                day = cell.find('span').getText()
+                # print("date: {}".format(day))
+                i += 1
+                continue
+            venue_text = cell.find('p').getText().strip()
+            if len(venue_text) > 0:
+                # Cell is not empty
+                print(venue_text)
+                this_a = cell.findAll('a')  # .get('href')
+                for a in this_a:
+                    # There may be several links in a cell (which represents a state)
+                    venue_name = a.getText().strip()
+                    this_venue = Venue(states[i - 1], venue_name)
+                    date_string = day
+                    this_url = a.get('href')
+                    if this_url:
+                        # Create the Scratchings URL by substitution
+                        scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
+                        scratchings_url = base_url + scratchings_url
+                        this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
+                                                     '1970-01-01', scratchings_url)
+                        all_race_days.append(this_race_day)
             i += 1
-            continue
-        venue_text = cell.find('p').getText().strip()
-        if len(venue_text) > 0:
-            # Cell is not empty
-            print(venue_text)
-            this_a = cell.findAll('a')  # .get('href')
-            for a in this_a:
-                # There may be several links in a cell (which represents a state)
-                venue_name = a.getText().strip()
-                this_venue = Venue(states[i - 1], venue_name)
-                date_string = day
-                this_url = a.get('href')
-                if this_url:
-                    # Create the Scratchings URL by substitution
-                    scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
-                    scratchings_url = base_url + scratchings_url
-                    this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
-                    all_race_days.append(this_race_day)
-        i += 1
     return all_race_days
 
 
@@ -75,9 +80,19 @@ def get_meta_data(this_data, this_venue):
     :return:
     """
     this_soup = BeautifulSoup(this_data, 'html.parser')
-    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
+    early = this_soup.select('div.large')
+    if early:
+        print(early.get_text())
+    if early and 'not currently available' in early.get_text():
+        print(early.get_text())
+        return
+    try:
+        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
+    except IndexError:
+        return
     last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
     close_regex = re.compile('Scratching close: (.+? AEST)')
+    # The times list is filled with a dateTime string then a unixtime (seconds since 1970)
     times = ['', 0, '', 0]
     time_format = '%a %d-%b-%y %I:%M%p'
     aest = timezone('Australia/Brisbane')
@@ -100,6 +115,52 @@ def get_meta_data(this_data, this_venue):
             # print(aest.localize(l_time))
             times[3] = model.convert_to_unixtime(aest.localize(l_time))
             # print(times[3])
-    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
-                       times[0], times[1], times[2], times[3])
+    # The RaceDay namedtuple is created and filled
+    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
+                       datetime.date.fromtimestamp(times[3]+12*60*60),
+                       this_venue.scratchings_url, times[0], times[1], times[2], times[3])
     return race_day
+
+
+def process_scratchings(this_data, this_venue):
+    this_soup = BeautifulSoup(this_data, 'html.parser')
+    try:
+        this_scr = this_soup.select('div.scratchings')[0]
+    except IndexError:
+        return
+    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
+    print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))
+    header = this_scr.select('h3', text=re.compile('Scratchings'))[0]
+    div = header.findNext('table')
+    old_race = 0
+    race = 0
+    scratchings = []
+    for text in div.stripped_strings:
+        if text[:5] == 'Race ':
+            match = re.search('^Race ([0-9]+):$', text)
+            if match:
+                try:
+                    race = int(match.group(1))
+                except ValueError:
+                    # This will happily fail in the next assert
+                    race = 0
+                assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
+                old_race = race
+            continue
+        if text[0] == '(':
+            continue
+        if len(text) > 0:
+            if text[0:10] == 'There are ':
+                continue
+            try:
+                int(text[0])
+            except ValueError:
+                print('First character in line: {}'.format(text[0]))
+                print('The start of the offending line is: {}'.format(text[0:10]))
+                continue
+            temp_list = Scratching(this_venue.name, this_venue.date, race, text)
+            scratchings.append(temp_list)
+    assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
+        len(scratchings), scratchings_count)
+    pprint(scratchings)
+    return scratchings

+ 6 - 1
_html.py

@@ -7,6 +7,11 @@ This module contains methods to retrieve pages
 
 
 def get_page(this_url):
-    # url = 'https://twitter.com/TheOnion'
+    """
+    Simple utility gets the contents of a webpage
+    @TODO Add Try/Catch blocks. Handle http errors.
+    :param this_url:
+    :return:
+    """
     data = requests.get(this_url)
     return data.text

+ 14 - 0
database.py

@@ -0,0 +1,14 @@
+import dotenv
+import os
+import psycopg2
+
+
+# Develop only
+dotenv.load_dotenv()
+
+host = os.environ["STOCK_DB_HOST"]
+port = os.environ["STOCK_DB_PORT"]
+database = os.environ["STOCK_DB_DATABASE"]
+user = os.environ["STOCK_DB_USER"]
+password = os.environ["STOCK_DB_PASSWD"]
+db = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)

+ 64 - 4
main.py

@@ -1,14 +1,74 @@
+import psycopg2.extras
 import model
-# import os
 from pprint import pprint
-
+import time
+import database
 
 if __name__ == '__main__':
-    # current_timezone = os.environ['TZ']
+    start = time.time()
+
+    db = database.db
 
     race_days_global = model.scrape_main_page()
+    interim = time.time()
+    print('interim 1 {}'.format(interim - start))
     # pprint(race_days_global)
     race_days = []
+    raw_data_dict = {}
     for race_day in race_days_global:
-        race_days.append(model.get_scratchings(race_day))
+        raw_data = model.get_raw_scratchings(race_day)
+        race_day_details = model.process_raw_data(raw_data, race_day)
+        race_days.append(race_day_details)
+        raw_data_dict[race_day.name] = raw_data
+    interim = time.time()
+    print('interim 2 {}'.format(interim - start))
     pprint(race_days)
+    cursor = db.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
+    for race_day in race_days:
+        if not raw_data_dict or not race_day:
+            # raw_data_dict may be empty when there is no data available (yet)
+            continue
+        raw_data = raw_data_dict[race_day.name]
+        scratchings = model.get_scratching_details(raw_data, race_day)
+        if not scratchings:
+            # model.get_scratchings_details may return empty
+            continue
+        # retrieve previous stored scratching for this venue / day
+        query = "SELECT * FROM horses WHERE venue = %s AND race_date = %s;"
+
+        cursor.execute(query, (race_day.name, race_day.date))
+        db_data = cursor.fetchall()
+
+        # compare retrieved scratchings with new data
+        match = False
+        for scratching in scratchings:
+            for row in db_data:
+                # print(row)
+                if (
+                        scratching.date == row.race_date and
+                        scratching.venue == row.venue and
+                        scratching.race == row.race and
+                        scratching.horse == row.horse
+                ):
+                    message = 'Horse found: date = {}, venue = {}, race = {}, horse = {}'.format(scratching.date,
+                                                                                                 scratching.venue,
+                                                                                                 scratching.race,
+                                                                                                 scratching.horse)
+                    print(message)
+                    match = True
+            if not match:
+                # report new scratching
+                message = 'New scratching: {} {} race {} horse {}'.format(scratching.date, scratching.venue,
+                                                                          scratching.race, scratching.horse)
+                print(message)
+                # store new scratching
+                query = "INSERT INTO horses(venue, race_date, race, horse) VALUES(%s, %s, %s, %s)"
+                cursor.execute(query, (scratching.venue, scratching.date,
+                                       scratching.race, scratching.horse))
+            db.commit()
+            match = False
+    cursor.close()
+    db.close()
+
+    interim = time.time()
+    print('interim 3 {}'.format(interim - start))

+ 27 - 4
model.py

@@ -2,6 +2,8 @@ import _html
 import _bs
 import pytz
 import datetime
+# import time
+import psycopg2.extras
 
 
 """
@@ -26,14 +28,35 @@ def scrape_main_page():
     return venues_all
 
 
-def get_scratchings(this_venue):
-    this_data = _html.get_page(this_venue[3])
-    # print(this_data)
-    race_day_info = _bs.get_meta_data(this_data, this_venue)
+def get_raw_scratchings(this_venue):
+    this_raw_data = _html.get_page(this_venue.scratchings_url)
+    return this_raw_data
+
+
+def process_raw_data(this_raw_data, this_venue):
+    """
+    Processes the raw data from the Scratchings page to obtain meta data.
+    this_venue is passed to _bs.get_meta_data() to create the inherited namedTuple
+    :param this_raw_data:
+    :param this_venue:
+    :return:
+    """
+    race_day_info = _bs.get_meta_data(this_raw_data, this_venue)
     return race_day_info
 
 
+def get_scratching_details(this_raw_data, this_venue):
+    # this_data = _html.get_page(this_venue.scratchings_url)
+    scratchings_info = _bs.process_scratchings(this_raw_data, this_venue)
+    return scratchings_info
+
+
 def convert_to_unixtime(dt_object):
+    """
+    Simple utility function that returns the unixtime from a timezone-aware datetime object
+    :param dt_object:
+    :return:
+    """
     utc = pytz.UTC
     d = dt_object.astimezone(utc)
 

+ 2 - 0
requirements.txt

@@ -2,6 +2,8 @@ beautifulsoup4==4.7.1
 certifi==2019.6.16
 chardet==3.0.4
 idna==2.8
+psycopg2==2.8.3
+python-dotenv==0.10.3
 pytz==2019.1
 requests==2.22.0
 soupsieve==1.9.1

+ 21 - 0
view.py

@@ -0,0 +1,21 @@
+import os
+import datetime
+import requests
+
+
+def broadcast(this_message):
+    """
+    Posts a timestamped message to the webhook at BROADCAST_URL.
+    :param this_message:
+    :return:
+    """
+    # development only
+    # load_dotenv()
+    url = os.environ["BROADCAST_URL"]
+    this_time = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M')
+    json = {'content': this_time+': '+this_message}
+    response = requests.post(url, json=json)
+    if response.status_code in [200, 204]:
+        print("Webhook executed")
+    else:
+        print("status code {}: {}".format(response.status_code, response.content.decode("utf-8")))