|
|
@@ -13,10 +13,21 @@ This module contains custom methods based on bs4.beautifulsoup to analyze data
|
|
|
|
|
|
base_url = 'https://racingaustralia.horse/FreeFields/'
|
|
|
Venue = collections.namedtuple('Venue', 'state, name')
|
|
|
-RaceDay = collections.namedtuple('RaceDay', Venue._fields + ('date_string', 'scratchings_url'))
|
|
|
+RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
|
|
|
+# noinspection PyProtectedMember,PyUnresolvedReferences
|
|
|
+RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
|
|
|
+ 'scratchings_latest_datetime', 'scratchings_latest_unixtime',
|
|
|
+ 'scratchings_close_datetime', 'scratchings_close_unixtime'))
|
|
|
|
|
|
|
|
|
def get_today_row(this_text):
|
|
|
+ """
|
|
|
+ Traverses the main table on the front page of https://racingaustralia.horse.
|
|
|
+ This function scrapes Venue information and race day information.
|
|
|
+ Unfortunately there is no clever way to split this function into two parts.
|
|
|
+    :param this_text: HTML text of the front page (parsed with BeautifulSoup)
|
|
|
+    :return: all_race_days, a list of RaceDayShort namedtuples (one per venue link)
|
|
|
+ """
|
|
|
this_soup = BeautifulSoup(this_text, 'html.parser')
|
|
|
rows = this_soup.select('tr.rows')
|
|
|
my_row = rows[2]
|
|
|
@@ -27,28 +38,42 @@ def get_today_row(this_text):
|
|
|
day = 'Unknown'
|
|
|
for cell in cells:
|
|
|
if i == 0:
|
|
|
+ # First cell contains date information
|
|
|
day = cell.find('span').getText()
|
|
|
# print("date: {}".format(day))
|
|
|
i += 1
|
|
|
continue
|
|
|
venue_text = cell.find('p').getText().strip()
|
|
|
if len(venue_text) > 0:
|
|
|
- # print("{}: {}".format(states[i-1], venue_text))
|
|
|
+ # Cell is not empty
|
|
|
+ print(venue_text)
|
|
|
this_a = cell.findAll('a') # .get('href')
|
|
|
for a in this_a:
|
|
|
+ # There may be several links in a cell (which represents a state)
|
|
|
venue_name = a.getText().strip()
|
|
|
this_venue = Venue(states[i - 1], venue_name)
|
|
|
date_string = day
|
|
|
this_url = a.get('href')
|
|
|
- scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
|
|
|
- scratchings_url = base_url + scratchings_url
|
|
|
- this_race_day = RaceDay(this_venue.state, this_venue.name, date_string, scratchings_url)
|
|
|
- all_race_days.append(this_race_day)
|
|
|
+ if this_url:
|
|
|
+ # Create the Scratchings URL by substitution
|
|
|
+ scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
|
|
|
+ scratchings_url = base_url + scratchings_url
|
|
|
+ this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
|
|
|
+ all_race_days.append(this_race_day)
|
|
|
i += 1
|
|
|
return all_race_days
|
|
|
|
|
|
|
|
|
-def get_meta_data(this_data):
|
|
|
+def get_meta_data(this_data, this_venue):
|
|
|
+ """
|
|
|
+ Meta data is on the top-right of the Scratchings page. It contains a date and time for
|
|
|
+ the latest update as well as the closing of reporting of Scratchings.
|
|
|
+    This function scrapes both datetimes and converts them to unixtime (which is timezone independent).
|
|
|
+    The RaceDayShort namedtuple fields are extended into a full RaceDay accordingly.
|
|
|
+    :param this_data: HTML text of the Scratchings page (parsed with BeautifulSoup)
|
|
|
+    :param this_venue: RaceDayShort supplying state, name, date_string and scratchings_url
|
|
|
+    :return: a RaceDay namedtuple combining the venue fields with the scraped times
|
|
|
+ """
|
|
|
this_soup = BeautifulSoup(this_data, 'html.parser')
|
|
|
this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
|
|
|
last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
|
|
|
@@ -60,20 +85,21 @@ def get_meta_data(this_data):
|
|
|
this_meta_data = this_meta_data.getText()
|
|
|
match = last_published_regex.search(this_meta_data)
|
|
|
if match:
|
|
|
- print(match[1])
|
|
|
+ # print(match[1])
|
|
|
times[0] = match[1][:-5]
|
|
|
# times[0] = 'Thu 20-Jun-19 7:42AM'
|
|
|
l_time = datetime.datetime.strptime(times[0], time_format)
|
|
|
# print(aest.localize(l_time))
|
|
|
times[1] = model.convert_to_unixtime(aest.localize(l_time))
|
|
|
- print(times[1])
|
|
|
+ # print(times[1])
|
|
|
match = close_regex.search(this_meta_data)
|
|
|
if match:
|
|
|
- print(match[1])
|
|
|
+ # print(match[1])
|
|
|
times[2] = match[1][:-5]
|
|
|
l_time = datetime.datetime.strptime(times[2], time_format)
|
|
|
# print(aest.localize(l_time))
|
|
|
times[3] = model.convert_to_unixtime(aest.localize(l_time))
|
|
|
- print(times[3])
|
|
|
- return times
|
|
|
- # print(this_meta_data)
|
|
|
+ # print(times[3])
|
|
|
+ race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
|
|
|
+ times[0], times[1], times[2], times[3])
|
|
|
+ return race_day
|