|
@@ -0,0 +1,95 @@
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
|
+import re
|
|
|
|
|
+# import datetime
|
|
|
|
|
+# from pytz import timezone
|
|
|
|
|
+import model
|
|
|
|
|
+import collections
|
|
|
|
|
+# # import pytz
|
|
|
|
|
+# from pprint import pprint
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+"""
|
|
|
|
|
+This module contains helper functions built on bs4.BeautifulSoup for scraping race day data from racingaustralia.horse pages
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
# Prefix that get_today_row() puts in front of every rewritten program page link.
base_url = 'https://racingaustralia.horse/FreeFields/'

# A venue is identified by the state column it was found in and its link text.
Venue = collections.namedtuple('Venue', ['state', 'name'])

# One meeting as scraped from the calendar table: the venue fields followed by
# the date (raw text and converted value) and the URL of its program page.
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDayShort = collections.namedtuple(
    'RaceDayShort',
    Venue._fields + ('date_string', 'date', 'program_url'),
)

# RaceDayShort extended with four scratchings fields: the "latest" and the
# "close" moment, each stored both as a datetime and as a unixtime field.
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple(
    'RaceDay',
    RaceDayShort._fields + (
        'scratchings_latest_datetime',
        'scratchings_latest_unixtime',
        'scratchings_close_datetime',
        'scratchings_close_unixtime',
    ),
)

# A single scratching record.
Scratching = collections.namedtuple(
    'Scratching',
    ['venue', 'state', 'date', 'race', 'horse'],
)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def get_today_row(this_text, this_row):
    """
    Collect the race days listed in the main table of the front page of
    https://racingaustralia.horse.

    Every ``tr.rows`` row is handled as one calendar day: the first cell holds
    the date text (inside a ``span``) and each following cell holds the venue
    links of one state, in the fixed column order of ``states`` below.
    Venue and race day information are scraped in a single pass because both
    come from the same link.

    :param this_text: HTML text of the page to scrape
    :param this_row: index of the row to scrape, or -1 to scrape every row.
        Any other negative value indexes from the end, as for a normal list.
    :return: list of RaceDayShort, one for every venue link that has an href
    :raises IndexError: if this_row does not address an existing row
    """
    # Column order of the state cells that follow the date cell.
    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')

    rows = BeautifulSoup(this_text, 'html.parser').select('tr.rows')
    row_indexes = range(len(rows)) if this_row == -1 else [this_row]

    all_race_days = []
    for row_index in row_indexes:
        cells = rows[row_index].select('td')
        if not cells:
            continue

        # First cell contains date information; fall back to 'Unknown' when
        # the expected span is missing instead of failing on None.
        date_span = cells[0].find('span')
        date_string = date_span.get_text() if date_span is not None else 'Unknown'

        # zip() pairs each remaining cell with its state and ignores any cell
        # beyond the known state columns.
        for state, cell in zip(states, cells[1:]):
            venue_paragraph = cell.find('p')
            if venue_paragraph is None or not venue_paragraph.get_text().strip():
                # Cell is empty: no meeting in this state on this day
                continue

            # There may be several links in a cell (which represents a state)
            for link in cell.find_all('a'):
                href = link.get('href')
                if not href:
                    continue

                # Create the program URL by substitution: everything up to and
                # including the page name in the path part becomes
                # 'RaceProgram.aspx', the query string is kept untouched.
                # Anchoring on the path (not on a leading '/') also handles
                # relative and absolute hrefs.
                program_page = re.sub(r'^[^?#]*\.aspx', 'RaceProgram.aspx', href)

                all_race_days.append(RaceDayShort(
                    state=state,
                    name=link.get_text().strip(),
                    date_string=date_string,
                    date=model.convert_to_date(date_string),
                    program_url=base_url + program_page,
                ))
    return all_race_days
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def separate_races(program_html):
    """
    Get the race number and time from the description line of each race.

    The description lines are the ``a.race-title-anchor-3`` links inside the
    ``table.race-title`` tables. A line that does not read
    ``Race <number> - <h:mm>AM|PM`` (optionally followed by more text) is
    skipped.

    :param program_html: HTML text of the page to scrape
    :return: list of ``(race_number, time)`` tuples of strings in page order,
        for example ``('3', '1:45PM')``
    :rtype: list
    """
    # The time must be followed by whitespace or the end of the text, so a
    # line that stops right after the time is accepted as well.
    race_heading = re.compile(r'^Race (\d+) - (\d{1,2}:\d{2}[AP]M)(?:\s|$)')

    this_soup = BeautifulSoup(program_html, 'html.parser')
    races = []
    for table in this_soup.select('table.race-title'):
        for title in table.select('a.race-title-anchor-3'):
            # Strip first: markup indentation around the link text would
            # otherwise defeat the '^Race' anchor.
            race_match = race_heading.search(title.get_text().strip())
            if race_match:
                races.append(race_match.groups())
    return races
|