hace 6 años · 8f8ac0c518
--- a/.idea/dataSources.xml
+++ b/.idea/dataSources.xml
@@ -0,0 +1,11 @@
 
																+<?xml version="1.0" encoding="UTF-8"?>
															
 
																+<project version="4">
															
 
																+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
															
 
																+    <data-source source="LOCAL" name="horses@relentless.rocks" uuid="589bba45-cb38-44b4-8f6a-0076913eac99">
															
 
																+      <driver-ref>postgresql</driver-ref>
															
 
																+      <synchronize>true</synchronize>
															
 
																+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
															
 
																+      <jdbc-url>jdbc:postgresql://relentless.rocks:5432/horses</jdbc-url>
															
 
																+    </data-source>
															
 
																+  </component>
															
 
																+</project>
															
--- a/.idea/dictionaries/foppe.xml
+++ b/.idea/dictionaries/foppe.xml
@@ -4,8 +4,11 @@
 
																       <w>aest</w>
															
 
																       <w>aspx</w>
															
 
																       <w>beautifulsoup</w>
															
 
																+      <w>dotenv</w>
															
 
																       <w>pytz</w>
															
 
																+      <w>sratchings</w>
															
 
																       <w>unixtime</w>
															
 
																+      <w>webpage</w>
															
 
																     </words>
															
 
																   </dictionary>
															
 
																 </component>
															
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,6 @@
 
																+<component name="InspectionProjectProfileManager">
															
 
																+  <profile version="1.0">
															
 
																+    <option name="myName" value="Project Default" />
															
 
																+    <inspection_tool class="DotEnvDuplicateKeyInspection" enabled="false" level="WARNING" enabled_by_default="false" />
															
 
																+  </profile>
															
 
																+</component>
															
--- a/_bs.py
+++ b/_bs.py
@@ -5,6 +5,7 @@ from pytz import timezone
 
																 import model
															
 
																 import collections
															
 
																 # import pytz
															
 
																+from pprint import pprint
															
 
																 """
															
@@ -13,11 +14,12 @@ This module contains custom methods based on bs4.beautifulsoup to analyze data
 
																 base_url = 'https://racingaustralia.horse/FreeFields/'
															
 
																 Venue = collections.namedtuple('Venue', 'state, name')
															
 
																-RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
															
 
																+RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
															
 
																 # noinspection PyProtectedMember,PyUnresolvedReferences
															
 
																 RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
															
 
																     'scratchings_latest_datetime', 'scratchings_latest_unixtime',
															
 
																     'scratchings_close_datetime', 'scratchings_close_unixtime'))
															
 
																+Scratching = collections.namedtuple('Scratching', 'venue date race horse')
															
 
																 def get_today_row(this_text):
															
@@ -30,37 +32,40 @@ def get_today_row(this_text):
 
																     """
															
 
																     this_soup = BeautifulSoup(this_text, 'html.parser')
															
 
																     rows = this_soup.select('tr.rows')
															
 
																-    my_row = rows[2]
															
 
																-    cells = my_row.select('td')
															
 
																-    i = 0
															
 
																-    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
															
 
																+    # print('len(rows) {}'.format(len(rows)))
															
 
																     all_race_days = []
															
 
																-    day = 'Unknown'
															
 
																-    for cell in cells:
															
 
																-        if i == 0:
															
 
																-            # First cell contains date information
															
 
																-            day = cell.find('span').getText()
															
 
																-            # print("date: {}".format(day))
															
 
																+    for day in range(len(rows)):
															
 
																+        my_row = rows[day]
															
 
																+        cells = my_row.select('td')
															
 
																+        i = 0
															
 
																+        states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
															
 
																+        day = 'Unknown'
															
 
																+        for cell in cells:
															
 
																+            if i == 0:
															
 
																+                # First cell contains date information
															
 
																+                day = cell.find('span').getText()
															
 
																+                # print("date: {}".format(day))
															
 
																+                i += 1
															
 
																+                continue
															
 
																+            venue_text = cell.find('p').getText().strip()
															
 
																+            if len(venue_text) > 0:
															
 
																+                # Cell is not empty
															
 
																+                print(venue_text)
															
 
																+                this_a = cell.findAll('a')  # .get('href')
															
 
																+                for a in this_a:
															
 
																+                    # There may be several links in a cell (which represents a state)
															
 
																+                    venue_name = a.getText().strip()
															
 
																+                    this_venue = Venue(states[i - 1], venue_name)
															
 
																+                    date_string = day
															
 
																+                    this_url = a.get('href')
															
 
																+                    if this_url:
															
 
																+                        # Create the Scratchings URL by substitution
															
 
																+                        scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
															
 
																+                        scratchings_url = base_url + scratchings_url
															
 
																+                        this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
															
 
																+                                                     '1970-01-01', scratchings_url)
															
 
																+                        all_race_days.append(this_race_day)
															
 
																             i += 1
															
 
																-            continue
															
 
																-        venue_text = cell.find('p').getText().strip()
															
 
																-        if len(venue_text) > 0:
															
 
																-            # Cell is not empty
															
 
																-            print(venue_text)
															
 
																-            this_a = cell.findAll('a')  # .get('href')
															
 
																-            for a in this_a:
															
 
																-                # There may be several links in a cell (which represents a state)
															
 
																-                venue_name = a.getText().strip()
															
 
																-                this_venue = Venue(states[i - 1], venue_name)
															
 
																-                date_string = day
															
 
																-                this_url = a.get('href')
															
 
																-                if this_url:
															
 
																-                    # Create the Scratchings URL by substitution
															
 
																-                    scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
															
 
																-                    scratchings_url = base_url + scratchings_url
															
 
																-                    this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
															
 
																-                    all_race_days.append(this_race_day)
															
 
																-        i += 1
															
 
																     return all_race_days
															
@@ -75,9 +80,19 @@ def get_meta_data(this_data, this_venue):
 
																     :return:
															
 
																     """
															
 
																     this_soup = BeautifulSoup(this_data, 'html.parser')
															
 
																-    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
															
 
																+    early = this_soup.select('div.large')
															
 
																+    if early:
															
 
																+        print(early.get_text())
															
 
																+    if early and 'not currently available' in early.get_text():
															
 
																+        print(early.get_text())
															
 
																+        return
															
 
																+    try:
															
 
																+        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
															
 
																+    except IndexError:
															
 
																+        return
															
 
																     last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
															
 
																     close_regex = re.compile('Scratching close: (.+? AEST)')
															
 
																+    # The times tuple is filled with a dateTime string then a unixtime (seconds since 1970)
															
 
																     times = ['', 0, '', 0]
															
 
																     time_format = '%a %d-%b-%y %I:%M%p'
															
 
																     aest = timezone('Australia/Brisbane')
															
@@ -100,6 +115,52 @@ def get_meta_data(this_data, this_venue):
 
																             # print(aest.localize(l_time))
															
 
																             times[3] = model.convert_to_unixtime(aest.localize(l_time))
															
 
																             # print(times[3])
															
 
																-    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
															
 
																-                       times[0], times[1], times[2], times[3])
															
 
																+    # The RaceDAy namedTuple is created and filled
															
 
																+    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
															
 
																+                       datetime.date.fromtimestamp(times[3]+12*60*60),
															
 
																+                       this_venue.scratchings_url, times[0], times[1], times[2], times[3])
															
 
																     return race_day
															
 
																+
															
 
																+
															
 
																+def process_scratchings(this_data, this_venue):
															
 
																+    this_soup = BeautifulSoup(this_data, 'html.parser')
															
 
																+    try:
															
 
																+        this_scr = this_soup.select('div.scratchings')[0]
															
 
																+    except IndexError:
															
 
																+        return
															
 
																+    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
															
 
																+    print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))
															
 
																+    header = this_scr.select('h3', text=re.compile('Scratchings'))[0]
															
 
																+    div = header.findNext('table')
															
 
																+    old_race = 0
															
 
																+    race = 0
															
 
																+    scratchings = []
															
 
																+    for text in div.stripped_strings:
															
 
																+        if text[:5] == 'Race ':
															
 
																+            match = re.search('^Race ([0-9]+):$', text)
															
 
																+            if match:
															
 
																+                try:
															
 
																+                    race = int(match.group(1))
															
 
																+                except ValueError:
															
 
																+                    # This will happily fail in the next assert
															
 
																+                    race = 0
															
 
																+                assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
															
 
																+                old_race = race
															
 
																+            continue
															
 
																+        if text[0] == '(':
															
 
																+            continue
															
 
																+        if len(text) > 0:
															
 
																+            if text[0:10] == 'There are ':
															
 
																+                continue
															
 
																+            try:
															
 
																+                int(text[0])
															
 
																+            except ValueError:
															
 
																+                print('First character in line: {}'.format(text[0]))
															
 
																+                print('The start of the offending line is: {}'.format(text[0:10]))
															
 
																+                continue
															
 
																+            temp_list = Scratching(this_venue.name, this_venue.date, race, text)
															
 
																+            scratchings.append(temp_list)
															
 
																+    assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
															
 
																+        len(scratchings), scratchings_count)
															
 
																+    pprint(scratchings)
															
 
																+    return scratchings
															
--- a/_html.py
+++ b/_html.py
@@ -7,6 +7,11 @@ This module contains methods to retrieve pages
 
																 def get_page(this_url):
															
 
																-    # url = 'https://twitter.com/TheOnion'
															
 
																+    """
															
 
																+    Simple utility gets the contents of a webpage
															
 
																+    @TODO Add Try/Catch blocks. Handle http errors.
															
 
																+    :param this_url:
															
 
																+    :return:
															
 
																+    """
															
 
																     data = requests.get(this_url)
															
 
																     return data.text
															
--- a/database.py
+++ b/database.py
@@ -0,0 +1,14 @@
 
																+import dotenv
															
 
																+import os
															
 
																+import psycopg2
															
 
																+
															
 
																+
															
 
																+# Develop only
															
 
																+dotenv.load_dotenv()
															
 
																+
															
 
																+host = os.environ["STOCK_DB_HOST"]
															
 
																+port = os.environ["STOCK_DB_PORT"]
															
 
																+database = os.environ["STOCK_DB_DATABASE"]
															
 
																+user = os.environ["STOCK_DB_USER"]
															
 
																+password = os.environ["STOCK_DB_PASSWD"]
															
 
																+db = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)
															
--- a/main.py
+++ b/main.py
@@ -1,14 +1,74 @@
 
																+import psycopg2.extras
															
 
																 import model
															
 
																-# import os
															
 
																 from pprint import pprint
															
 
																-
															
 
																+import time
															
 
																+import database
															
 
																 if __name__ == '__main__':
															
 
																-    # current_timezone = os.environ['TZ']
															
 
																+    start = time.time()
															
 
																+
															
 
																+    db = database.db
															
 
																     race_days_global = model.scrape_main_page()
															
 
																+    interim = time.time()
															
 
																+    print('interim 1 {}'.format(interim - start))
															
 
																     # pprint(race_days_global)
															
 
																     race_days = []
															
 
																+    raw_data_dict = {}
															
 
																     for race_day in race_days_global:
															
 
																-        race_days.append(model.get_scratchings(race_day))
															
 
																+        raw_data = model.get_raw_scratchings(race_day)
															
 
																+        race_day_details = model.process_raw_data(raw_data, race_day)
															
 
																+        race_days.append(race_day_details)
															
 
																+        raw_data_dict[race_day.name] = raw_data
															
 
																+    interim = time.time()
															
 
																+    print('interim 2 {}'.format(interim - start))
															
 
																     pprint(race_days)
															
 
																+    cursor = db.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
															
 
																+    for race_day in race_days:
															
 
																+        if not raw_data_dict or not race_day:
															
 
																+            # raw_data_dict may be empty when there is no data available (yet)
															
 
																+            continue
															
 
																+        raw_data = raw_data_dict[race_day.name]
															
 
																+        scratchings = model.get_scratching_details(raw_data, race_day)
															
 
																+        if not scratchings:
															
 
																+            # model.get_scratchings_details may return empty
															
 
																+            continue
															
 
																+        # retrieve previous stored scratching for this venue / day
															
 
																+        query = "SELECT * FROM horses WHERE venue = %s AND race_date = %s;"
															
 
																+
															
 
																+        cursor.execute(query, (race_day.name, race_day.date))
															
 
																+        db_data = cursor.fetchall()
															
 
																+
															
 
																+        # compare retrieved scratchings with new data
															
 
																+        match = False
															
 
																+        for scratching in scratchings:
															
 
																+            for row in db_data:
															
 
																+                # print(row)
															
 
																+                if (
															
 
																+                        scratching.date == row.race_date and
															
 
																+                        scratching.venue == row.venue and
															
 
																+                        scratching.race == row.race and
															
 
																+                        scratching.horse == row.horse
															
 
																+                ):
															
 
																+                    message = 'Horse found: date = {}, venue = {}, race = {}, horse = {}'.format(scratching.date,
															
 
																+                                                                                                 scratching.venue,
															
 
																+                                                                                                 scratching.race,
															
 
																+                                                                                                 scratching.horse)
															
 
																+                    print(message)
															
 
																+                    match = True
															
 
																+            if not match:
															
 
																+                # report new scratching
															
 
																+                message = 'New scratching: {} {} race {} horse {}'.format(scratching.date, scratching.venue,
															
 
																+                                                                          scratching.race, scratching.horse)
															
 
																+                print(message)
															
 
																+                # store new scratching
															
 
																+                query = "INSERT INTO horses(venue, race_date, race, horse) VALUES(%s, %s, %s, %s)"
															
 
																+                cursor.execute(query, (scratching.venue, scratching.date,
															
 
																+                                       scratching.race, scratching.horse))
															
 
																+            db.commit()
															
 
																+            match = False
															
 
																+    cursor.close()
															
 
																+    db.close()
															
 
																+
															
 
																+    interim = time.time()
															
 
																+    print('interim 3 {}'.format(interim - start))
															
--- a/model.py
+++ b/model.py
@@ -2,6 +2,8 @@ import _html
 
																 import _bs
															
 
																 import pytz
															
 
																 import datetime
															
 
																+# import time
															
 
																+import psycopg2.extras
															
 
																 """
															
@@ -26,14 +28,35 @@ def scrape_main_page():
 
																     return venues_all
															
 
																-def get_scratchings(this_venue):
															
 
																-    this_data = _html.get_page(this_venue[3])
															
 
																-    # print(this_data)
															
 
																-    race_day_info = _bs.get_meta_data(this_data, this_venue)
															
 
																+def get_raw_scratchings(this_venue):
															
 
																+    this_raw_data = _html.get_page(this_venue.scratchings_url)
															
 
																+    return this_raw_data
															
 
																+
															
 
																+
															
 
																+def process_raw_data(this_raw_data, this_venue):
															
 
																+    """
															
 
																+    Processes the raw data from the Scratchings page to obtain meta data.
															
 
																+    this_venue is passed to _bs.process_scratchings() to create the inherited namedTuple
															
 
																+    :param this_raw_data:
															
 
																+    :param this_venue:
															
 
																+    :return:
															
 
																+    """
															
 
																+    race_day_info = _bs.get_meta_data(this_raw_data, this_venue)
															
 
																     return race_day_info
															
 
																+def get_scratching_details(this_raw_data, this_venue):
															
 
																+    # this_data = _html.get_page(this_venue.scratchings_url)
															
 
																+    scratchings_info = _bs.process_scratchings(this_raw_data, this_venue)
															
 
																+    return scratchings_info
															
 
																+
															
 
																+
															
 
																 def convert_to_unixtime(dt_object):
															
 
																+    """
															
 
																+    Simple utility function that returns the unixtime from a timezone aware dateTime object
															
 
																+    :param dt_object:
															
 
																+    :return:
															
 
																+    """
															
 
																     utc = pytz.UTC
															
 
																     d = dt_object.astimezone(utc)
															
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,8 @@ beautifulsoup4==4.7.1
 
																 certifi==2019.6.16
															
 
																 chardet==3.0.4
															
 
																 idna==2.8
															
 
																+psycopg2==2.8.3
															
 
																+python-dotenv==0.10.3
															
 
																 pytz==2019.1
															
 
																 requests==2.22.0
															
 
																 soupsieve==1.9.1
															
--- a/view.py
+++ b/view.py
@@ -0,0 +1,21 @@
 
																+import os
															
 
																+import datetime
															
 
																+import requests
															
 
																+
															
 
																+
															
 
																+def broadcast(this_message):
															
 
																+    """
															
 
																+
															
 
																+    :param this_message:
															
 
																+    :return:
															
 
																+    """
															
 
																+    # development only
															
 
																+    # load_dotenv()
															
 
																+    url = os.environ["BROADCAST_URL"]
															
 
																+    this_time = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M')
															
 
																+    json = {'content': this_time+': '+this_message}
															
 
																+    response = requests.post(url, json=json)
															
 
																+    if response.status_code in [200, 204]:
															
 
																+        print("Webhook executed")
															
 
																+    else:
															
 
																+        print("status code {}: {}".format(response.status_code, response.content.decode("utf-8")))