|
|
@@ -13,10 +13,21 @@ This module contains custom methods based on bs4.beautifulsoup to analyze data
|
|
|
|
|
|
base_url = 'https://racingaustralia.horse/FreeFields/'
|
|
|
Venue = collections.namedtuple('Venue', 'state, name')
|
|
|
-RaceDay = collections.namedtuple('RaceDay', Venue._fields + ('date_string', 'scratchings_url'))
|
|
|
+RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
|
|
|
+# noinspection PyProtectedMember,PyUnresolvedReferences
|
|
|
+RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
|
|
|
+ 'scratchings_latest_datetime', 'scratchings_latest_unixtime',
|
|
|
+ 'scratchings_close_datetime', 'scratchings_close_unixtime'))
|
|
|
|
|
|
|
|
|
def get_today_row(this_text):
|
|
|
+ """
|
|
|
+ Traverses the main table on the front page of https://racingaustralia.horse.
|
|
|
+ This function scrapes Venue information and race day information.
|
|
|
+ Unfortunately there is no clever way to split this function into two parts.
|
|
|
+    :param this_text: HTML text of the front page (parsed with BeautifulSoup)
|
|
|
+    :return: all_race_days, a list of RaceDayShort namedtuples (one per venue link)
|
|
|
+ """
|
|
|
this_soup = BeautifulSoup(this_text, 'html.parser')
|
|
|
rows = this_soup.select('tr.rows')
|
|
|
my_row = rows[2]
|
|
|
@@ -27,28 +38,42 @@ def get_today_row(this_text):
|
|
|
day = 'Unknown'
|
|
|
for cell in cells:
|
|
|
if i == 0:
|
|
|
+ # First cell contains date information
|
|
|
day = cell.find('span').getText()
|
|
|
# print("date: {}".format(day))
|
|
|
i += 1
|
|
|
continue
|
|
|
venue_text = cell.find('p').getText().strip()
|
|
|
if len(venue_text) > 0:
|
|
|
- # print("{}: {}".format(states[i-1], venue_text))
|
|
|
+ # Cell is not empty
|
|
|
+ print(venue_text)
|
|
|
this_a = cell.findAll('a') # .get('href')
|
|
|
for a in this_a:
|
|
|
+ # There may be several links in a cell (which represents a state)
|
|
|
venue_name = a.getText().strip()
|
|
|
this_venue = Venue(states[i - 1], venue_name)
|
|
|
date_string = day
|
|
|
this_url = a.get('href')
|
|
|
- scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
|
|
|
- scratchings_url = base_url + scratchings_url
|
|
|
- this_race_day = RaceDay(this_venue.state, this_venue.name, date_string, scratchings_url)
|
|
|
- all_race_days.append(this_race_day)
|
|
|
+ if this_url:
|
|
|
+ # Create the Scratchings URL by substitution
|
|
|
+ scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
|
|
|
+ scratchings_url = base_url + scratchings_url
|
|
|
+ this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
|
|
|
+ all_race_days.append(this_race_day)
|
|
|
i += 1
|
|
|
return all_race_days
|
|
|
|
|
|
|
|
|
-def get_meta_data(this_data):
|
|
|
+def get_meta_data(this_data, this_venue):
|
|
|
+ """
|
|
|
+ Meta data is on the top-right of the Scratchings page. It contains a date and time for
|
|
|
+ the latest update as well as the closing of reporting of Scratchings.
|
|
|
+    This function scrapes both datetimes and converts them to unixtime (which is timezone independent).
|
|
|
+    The RaceDayShort namedtuple fields are extended into a full RaceDay accordingly.
|
|
|
+    :param this_data: HTML text of the Scratchings page (parsed with BeautifulSoup)
|
|
|
+    :param this_venue: RaceDayShort supplying state, name, date_string and scratchings_url
|
|
|
+    :return: a RaceDay namedtuple combining the venue fields with the scraped times
|
|
|
+ """
|
|
|
this_soup = BeautifulSoup(this_data, 'html.parser')
|
|
|
this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
|
|
|
last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
|
|
|
@@ -60,20 +85,21 @@ def get_meta_data(this_data):
|
|
|
this_meta_data = this_meta_data.getText()
|
|
|
match = last_published_regex.search(this_meta_data)
|
|
|
if match:
|
|
|
- print(match[1])
|
|
|
+ # print(match[1])
|
|
|
times[0] = match[1][:-5]
|
|
|
# times[0] = 'Thu 20-Jun-19 7:42AM'
|
|
|
l_time = datetime.datetime.strptime(times[0], time_format)
|
|
|
# print(aest.localize(l_time))
|
|
|
times[1] = model.convert_to_unixtime(aest.localize(l_time))
|
|
|
- print(times[1])
|
|
|
+ # print(times[1])
|
|
|
match = close_regex.search(this_meta_data)
|
|
|
if match:
|
|
|
- print(match[1])
|
|
|
+ # print(match[1])
|
|
|
times[2] = match[1][:-5]
|
|
|
l_time = datetime.datetime.strptime(times[2], time_format)
|
|
|
# print(aest.localize(l_time))
|
|
|
times[3] = model.convert_to_unixtime(aest.localize(l_time))
|
|
|
- print(times[3])
|
|
|
- return times
|
|
|
- # print(this_meta_data)
|
|
|
+ # print(times[3])
|
|
|
+ race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
|
|
|
+ times[0], times[1], times[2], times[3])
|
|
|
+ return race_day
|