Foppe Hemminga 6 lat temu
rodzic
commit
8f8ac0c518
10 zmienionych plików z 248 dodań i 42 usunięć
  1. 11 0
      .idea/dataSources.xml
  2. 3 0
      .idea/dictionaries/foppe.xml
  3. 6 0
      .idea/inspectionProfiles/Project_Default.xml
  4. 94 33
      _bs.py
  5. 6 1
      _html.py
  6. 14 0
      database.py
  7. 64 4
      main.py
  8. 27 4
      model.py
  9. 2 0
      requirements.txt
  10. 21 0
      view.py

+ 11 - 0
.idea/dataSources.xml

@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="horses@relentless.rocks" uuid="589bba45-cb38-44b4-8f6a-0076913eac99">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://relentless.rocks:5432/horses</jdbc-url>
+    </data-source>
+  </component>
+</project>

+ 3 - 0
.idea/dictionaries/foppe.xml

@@ -4,8 +4,11 @@
       <w>aest</w>
       <w>aspx</w>
       <w>beautifulsoup</w>
+      <w>dotenv</w>
       <w>pytz</w>
+      <w>sratchings</w>
       <w>unixtime</w>
+      <w>webpage</w>
     </words>
   </dictionary>
 </component>

+ 6 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="DotEnvDuplicateKeyInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+  </profile>
+</component>

+ 94 - 33
_bs.py

@@ -5,6 +5,7 @@ from pytz import timezone
 import model
 import collections
 # import pytz
+from pprint import pprint
 
 
 """
@@ -13,11 +14,12 @@ This module contains custom methods based on bs4.beautifulsoup to analyze data
 
 base_url = 'https://racingaustralia.horse/FreeFields/'
 Venue = collections.namedtuple('Venue', 'state, name')
-RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
+RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
 # noinspection PyProtectedMember,PyUnresolvedReferences
 RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
     'scratchings_latest_datetime', 'scratchings_latest_unixtime',
     'scratchings_close_datetime', 'scratchings_close_unixtime'))
+Scratching = collections.namedtuple('Scratching', 'venue date race horse')
 
 
 def get_today_row(this_text):
@@ -30,37 +32,40 @@ def get_today_row(this_text):
     """
     this_soup = BeautifulSoup(this_text, 'html.parser')
     rows = this_soup.select('tr.rows')
-    my_row = rows[2]
-    cells = my_row.select('td')
-    i = 0
-    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
+    # print('len(rows) {}'.format(len(rows)))
     all_race_days = []
-    day = 'Unknown'
-    for cell in cells:
-        if i == 0:
-            # First cell contains date information
-            day = cell.find('span').getText()
-            # print("date: {}".format(day))
+    for day in range(len(rows)):
+        my_row = rows[day]
+        cells = my_row.select('td')
+        i = 0
+        states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
+        day = 'Unknown'
+        for cell in cells:
+            if i == 0:
+                # First cell contains date information
+                day = cell.find('span').getText()
+                # print("date: {}".format(day))
+                i += 1
+                continue
+            venue_text = cell.find('p').getText().strip()
+            if len(venue_text) > 0:
+                # Cell is not empty
+                print(venue_text)
+                this_a = cell.findAll('a')  # .get('href')
+                for a in this_a:
+                    # There may be several links in a cell (which represents a state)
+                    venue_name = a.getText().strip()
+                    this_venue = Venue(states[i - 1], venue_name)
+                    date_string = day
+                    this_url = a.get('href')
+                    if this_url:
+                        # Create the Scratchings URL by substitution
+                        scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
+                        scratchings_url = base_url + scratchings_url
+                        this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
+                                                     '1970-01-01', scratchings_url)
+                        all_race_days.append(this_race_day)
             i += 1
-            continue
-        venue_text = cell.find('p').getText().strip()
-        if len(venue_text) > 0:
-            # Cell is not empty
-            print(venue_text)
-            this_a = cell.findAll('a')  # .get('href')
-            for a in this_a:
-                # There may be several links in a cell (which represents a state)
-                venue_name = a.getText().strip()
-                this_venue = Venue(states[i - 1], venue_name)
-                date_string = day
-                this_url = a.get('href')
-                if this_url:
-                    # Create the Scratchings URL by substitution
-                    scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
-                    scratchings_url = base_url + scratchings_url
-                    this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
-                    all_race_days.append(this_race_day)
-        i += 1
     return all_race_days
 
 
@@ -75,9 +80,19 @@ def get_meta_data(this_data, this_venue):
     :return:
     """
     this_soup = BeautifulSoup(this_data, 'html.parser')
-    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
+    early = this_soup.select('div.large')
+    if early:
+        print(early.get_text())
+    if early and 'not currently available' in early.get_text():
+        print(early.get_text())
+        return
+    try:
+        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
+    except IndexError:
+        return
     last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
     close_regex = re.compile('Scratching close: (.+? AEST)')
+    # The times list is filled with a dateTime string then a unixtime (seconds since 1970)
     times = ['', 0, '', 0]
     time_format = '%a %d-%b-%y %I:%M%p'
     aest = timezone('Australia/Brisbane')
@@ -100,6 +115,52 @@ def get_meta_data(this_data, this_venue):
             # print(aest.localize(l_time))
             times[3] = model.convert_to_unixtime(aest.localize(l_time))
             # print(times[3])
-    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
-                       times[0], times[1], times[2], times[3])
+    # The RaceDay namedtuple is created and filled
+    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
+                       datetime.date.fromtimestamp(times[3]+12*60*60),
+                       this_venue.scratchings_url, times[0], times[1], times[2], times[3])
     return race_day
+
+
+def process_scratchings(this_data, this_venue):
+    this_soup = BeautifulSoup(this_data, 'html.parser')
+    try:
+        this_scr = this_soup.select('div.scratchings')[0]
+    except IndexError:
+        return
+    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
+    print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))
+    header = this_scr.select('h3', text=re.compile('Scratchings'))[0]
+    div = header.findNext('table')
+    old_race = 0
+    race = 0
+    scratchings = []
+    for text in div.stripped_strings:
+        if text[:5] == 'Race ':
+            match = re.search('^Race ([0-9]+):$', text)
+            if match:
+                try:
+                    race = int(match.group(1))
+                except ValueError:
+                    # This will happily fail in the next assert
+                    race = 0
+                assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
+                old_race = race
+            continue
+        if text[0] == '(':
+            continue
+        if len(text) > 0:
+            if text[0:10] == 'There are ':
+                continue
+            try:
+                int(text[0])
+            except ValueError:
+                print('First character in line: {}'.format(text[0]))
+                print('The start of the offending line is: {}'.format(text[0:10]))
+                continue
+            temp_list = Scratching(this_venue.name, this_venue.date, race, text)
+            scratchings.append(temp_list)
+    assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
+        len(scratchings), scratchings_count)
+    pprint(scratchings)
+    return scratchings

+ 6 - 1
_html.py

@@ -7,6 +7,11 @@ This module contains methods to retrieve pages
 
 
 def get_page(this_url):
-    # url = 'https://twitter.com/TheOnion'
+    """
+    Simple utility gets the contents of a webpage
+    @TODO Add Try/Catch blocks. Handle http errors.
+    :param this_url:
+    :return:
+    """
     data = requests.get(this_url)
     return data.text

+ 14 - 0
database.py

@@ -0,0 +1,14 @@
+import dotenv
+import os
+import psycopg2
+
+
+# Develop only
+dotenv.load_dotenv()
+
+host = os.environ["STOCK_DB_HOST"]
+port = os.environ["STOCK_DB_PORT"]
+database = os.environ["STOCK_DB_DATABASE"]
+user = os.environ["STOCK_DB_USER"]
+password = os.environ["STOCK_DB_PASSWD"]
+db = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)

+ 64 - 4
main.py

@@ -1,14 +1,74 @@
+import psycopg2.extras
 import model
-# import os
 from pprint import pprint
-
+import time
+import database
 
 if __name__ == '__main__':
-    # current_timezone = os.environ['TZ']
+    start = time.time()
+
+    db = database.db
 
     race_days_global = model.scrape_main_page()
+    interim = time.time()
+    print('interim 1 {}'.format(interim - start))
     # pprint(race_days_global)
     race_days = []
+    raw_data_dict = {}
     for race_day in race_days_global:
-        race_days.append(model.get_scratchings(race_day))
+        raw_data = model.get_raw_scratchings(race_day)
+        race_day_details = model.process_raw_data(raw_data, race_day)
+        race_days.append(race_day_details)
+        raw_data_dict[race_day.name] = raw_data
+    interim = time.time()
+    print('interim 2 {}'.format(interim - start))
     pprint(race_days)
+    cursor = db.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
+    for race_day in race_days:
+        if not raw_data_dict or not race_day:
+            # raw_data_dict may be empty when there is no data available (yet)
+            continue
+        raw_data = raw_data_dict[race_day.name]
+        scratchings = model.get_scratching_details(raw_data, race_day)
+        if not scratchings:
+            # model.get_scratchings_details may return empty
+            continue
+        # retrieve previous stored scratching for this venue / day
+        query = "SELECT * FROM horses WHERE venue = %s AND race_date = %s;"
+
+        cursor.execute(query, (race_day.name, race_day.date))
+        db_data = cursor.fetchall()
+
+        # compare retrieved scratchings with new data
+        match = False
+        for scratching in scratchings:
+            for row in db_data:
+                # print(row)
+                if (
+                        scratching.date == row.race_date and
+                        scratching.venue == row.venue and
+                        scratching.race == row.race and
+                        scratching.horse == row.horse
+                ):
+                    message = 'Horse found: date = {}, venue = {}, race = {}, horse = {}'.format(scratching.date,
+                                                                                                 scratching.venue,
+                                                                                                 scratching.race,
+                                                                                                 scratching.horse)
+                    print(message)
+                    match = True
+            if not match:
+                # report new scratching
+                message = 'New scratching: {} {} race {} horse {}'.format(scratching.date, scratching.venue,
+                                                                          scratching.race, scratching.horse)
+                print(message)
+                # store new scratching
+                query = "INSERT INTO horses(venue, race_date, race, horse) VALUES(%s, %s, %s, %s)"
+                cursor.execute(query, (scratching.venue, scratching.date,
+                                       scratching.race, scratching.horse))
+            db.commit()
+            match = False
+    cursor.close()
+    db.close()
+
+    interim = time.time()
+    print('interim 3 {}'.format(interim - start))

+ 27 - 4
model.py

@@ -2,6 +2,8 @@ import _html
 import _bs
 import pytz
 import datetime
+# import time
+import psycopg2.extras
 
 
 """
@@ -26,14 +28,35 @@ def scrape_main_page():
     return venues_all
 
 
-def get_scratchings(this_venue):
-    this_data = _html.get_page(this_venue[3])
-    # print(this_data)
-    race_day_info = _bs.get_meta_data(this_data, this_venue)
+def get_raw_scratchings(this_venue):
+    this_raw_data = _html.get_page(this_venue.scratchings_url)
+    return this_raw_data
+
+
+def process_raw_data(this_raw_data, this_venue):
+    """
+    Processes the raw data from the Scratchings page to obtain meta data.
+    this_venue is passed to _bs.get_meta_data() to create the inherited namedTuple
+    :param this_raw_data:
+    :param this_venue:
+    :return:
+    """
+    race_day_info = _bs.get_meta_data(this_raw_data, this_venue)
     return race_day_info
 
 
+def get_scratching_details(this_raw_data, this_venue):
+    # this_data = _html.get_page(this_venue.scratchings_url)
+    scratchings_info = _bs.process_scratchings(this_raw_data, this_venue)
+    return scratchings_info
+
+
 def convert_to_unixtime(dt_object):
+    """
+    Simple utility function that returns the unixtime from a timezone-aware datetime object
+    :param dt_object:
+    :return:
+    """
     utc = pytz.UTC
     d = dt_object.astimezone(utc)
 

+ 2 - 0
requirements.txt

@@ -2,6 +2,8 @@ beautifulsoup4==4.7.1
 certifi==2019.6.16
 chardet==3.0.4
 idna==2.8
+psycopg2==2.8.3
+python-dotenv==0.10.3
 pytz==2019.1
 requests==2.22.0
 soupsieve==1.9.1

+ 21 - 0
view.py

@@ -0,0 +1,21 @@
+import os
+import datetime
+import requests
+
+
+def broadcast(this_message):
+    """
+    Posts a timestamped message to the webhook at BROADCAST_URL.
+    :param this_message:
+    :return:
+    """
+    # development only
+    # load_dotenv()
+    url = os.environ["BROADCAST_URL"]
+    this_time = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M')
+    json = {'content': this_time+': '+this_message}
+    response = requests.post(url, json=json)
+    if response.status_code in [200, 204]:
+        print("Webhook executed")
+    else:
+        print("status code {}: {}".format(response.status_code, response.content.decode("utf-8")))