|
@@ -5,6 +5,7 @@ from pytz import timezone
|
|
|
import model
|
|
import model
|
|
|
import collections
|
|
import collections
|
|
|
# import pytz
|
|
# import pytz
|
|
|
|
|
+from pprint import pprint
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
"""
|
|
@@ -13,11 +14,12 @@ This module contains custom methods based on bs4.beautifulsoup to analyze data
|
|
|
|
|
|
|
|
base_url = 'https://racingaustralia.horse/FreeFields/'
|
|
base_url = 'https://racingaustralia.horse/FreeFields/'
|
|
|
Venue = collections.namedtuple('Venue', 'state, name')
|
|
Venue = collections.namedtuple('Venue', 'state, name')
|
|
|
-RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
|
|
|
|
|
|
|
+RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
|
|
|
# noinspection PyProtectedMember,PyUnresolvedReferences
|
|
# noinspection PyProtectedMember,PyUnresolvedReferences
|
|
|
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
|
|
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
|
|
|
'scratchings_latest_datetime', 'scratchings_latest_unixtime',
|
|
'scratchings_latest_datetime', 'scratchings_latest_unixtime',
|
|
|
'scratchings_close_datetime', 'scratchings_close_unixtime'))
|
|
'scratchings_close_datetime', 'scratchings_close_unixtime'))
|
|
|
|
|
+Scratching = collections.namedtuple('Scratching', 'venue date race horse')
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_today_row(this_text):
|
|
def get_today_row(this_text):
|
|
@@ -30,37 +32,40 @@ def get_today_row(this_text):
|
|
|
"""
|
|
"""
|
|
|
this_soup = BeautifulSoup(this_text, 'html.parser')
|
|
this_soup = BeautifulSoup(this_text, 'html.parser')
|
|
|
rows = this_soup.select('tr.rows')
|
|
rows = this_soup.select('tr.rows')
|
|
|
- my_row = rows[2]
|
|
|
|
|
- cells = my_row.select('td')
|
|
|
|
|
- i = 0
|
|
|
|
|
- states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
|
|
|
|
|
|
|
+ # print('len(rows) {}'.format(len(rows)))
|
|
|
all_race_days = []
|
|
all_race_days = []
|
|
|
- day = 'Unknown'
|
|
|
|
|
- for cell in cells:
|
|
|
|
|
- if i == 0:
|
|
|
|
|
- # First cell contains date information
|
|
|
|
|
- day = cell.find('span').getText()
|
|
|
|
|
- # print("date: {}".format(day))
|
|
|
|
|
|
|
+ for day in range(len(rows)):
|
|
|
|
|
+ my_row = rows[day]
|
|
|
|
|
+ cells = my_row.select('td')
|
|
|
|
|
+ i = 0
|
|
|
|
|
+ states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
|
|
|
|
|
+ day = 'Unknown'
|
|
|
|
|
+ for cell in cells:
|
|
|
|
|
+ if i == 0:
|
|
|
|
|
+ # First cell contains date information
|
|
|
|
|
+ day = cell.find('span').getText()
|
|
|
|
|
+ # print("date: {}".format(day))
|
|
|
|
|
+ i += 1
|
|
|
|
|
+ continue
|
|
|
|
|
+ venue_text = cell.find('p').getText().strip()
|
|
|
|
|
+ if len(venue_text) > 0:
|
|
|
|
|
+ # Cell is not empty
|
|
|
|
|
+ print(venue_text)
|
|
|
|
|
+ this_a = cell.findAll('a') # .get('href')
|
|
|
|
|
+ for a in this_a:
|
|
|
|
|
+ # There may be several links in a cell (which represents a state)
|
|
|
|
|
+ venue_name = a.getText().strip()
|
|
|
|
|
+ this_venue = Venue(states[i - 1], venue_name)
|
|
|
|
|
+ date_string = day
|
|
|
|
|
+ this_url = a.get('href')
|
|
|
|
|
+ if this_url:
|
|
|
|
|
+ # Create the Scratchings URL by substitution
|
|
|
|
|
+ scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
|
|
|
|
|
+ scratchings_url = base_url + scratchings_url
|
|
|
|
|
+ this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
|
|
|
|
|
+ '1970-01-01', scratchings_url)
|
|
|
|
|
+ all_race_days.append(this_race_day)
|
|
|
i += 1
|
|
i += 1
|
|
|
- continue
|
|
|
|
|
- venue_text = cell.find('p').getText().strip()
|
|
|
|
|
- if len(venue_text) > 0:
|
|
|
|
|
- # Cell is not empty
|
|
|
|
|
- print(venue_text)
|
|
|
|
|
- this_a = cell.findAll('a') # .get('href')
|
|
|
|
|
- for a in this_a:
|
|
|
|
|
- # There may be several links in a cell (which represents a state)
|
|
|
|
|
- venue_name = a.getText().strip()
|
|
|
|
|
- this_venue = Venue(states[i - 1], venue_name)
|
|
|
|
|
- date_string = day
|
|
|
|
|
- this_url = a.get('href')
|
|
|
|
|
- if this_url:
|
|
|
|
|
- # Create the Scratchings URL by substitution
|
|
|
|
|
- scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
|
|
|
|
|
- scratchings_url = base_url + scratchings_url
|
|
|
|
|
- this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
|
|
|
|
|
- all_race_days.append(this_race_day)
|
|
|
|
|
- i += 1
|
|
|
|
|
return all_race_days
|
|
return all_race_days
|
|
|
|
|
|
|
|
|
|
|
|
@@ -75,9 +80,19 @@ def get_meta_data(this_data, this_venue):
|
|
|
:return:
|
|
:return:
|
|
|
"""
|
|
"""
|
|
|
this_soup = BeautifulSoup(this_data, 'html.parser')
|
|
this_soup = BeautifulSoup(this_data, 'html.parser')
|
|
|
- this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
|
|
|
|
|
|
|
+ early = this_soup.select('div.large')
|
|
|
|
|
+ if early:
|
|
|
|
|
+ print(early.get_text())
|
|
|
|
|
+ if early and 'not currently available' in early.get_text():
|
|
|
|
|
+ print(early.get_text())
|
|
|
|
|
+ return
|
|
|
|
|
+ try:
|
|
|
|
|
+ this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
|
|
|
|
|
+ except IndexError:
|
|
|
|
|
+ return
|
|
|
last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
|
|
last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
|
|
|
close_regex = re.compile('Scratching close: (.+? AEST)')
|
|
close_regex = re.compile('Scratching close: (.+? AEST)')
|
|
|
|
|
+ # The times list holds two (datetime string, unixtime) pairs; unixtime is seconds since 1970
|
|
|
times = ['', 0, '', 0]
|
|
times = ['', 0, '', 0]
|
|
|
time_format = '%a %d-%b-%y %I:%M%p'
|
|
time_format = '%a %d-%b-%y %I:%M%p'
|
|
|
aest = timezone('Australia/Brisbane')
|
|
aest = timezone('Australia/Brisbane')
|
|
@@ -100,6 +115,52 @@ def get_meta_data(this_data, this_venue):
|
|
|
# print(aest.localize(l_time))
|
|
# print(aest.localize(l_time))
|
|
|
times[3] = model.convert_to_unixtime(aest.localize(l_time))
|
|
times[3] = model.convert_to_unixtime(aest.localize(l_time))
|
|
|
# print(times[3])
|
|
# print(times[3])
|
|
|
- race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
|
|
|
|
|
- times[0], times[1], times[2], times[3])
|
|
|
|
|
|
|
+ # The RaceDay namedtuple is created and filled
|
|
|
|
|
+ race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
|
|
|
|
|
+ datetime.date.fromtimestamp(times[3]+12*60*60),
|
|
|
|
|
+ this_venue.scratchings_url, times[0], times[1], times[2], times[3])
|
|
|
return race_day
|
|
return race_day
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def process_scratchings(this_data, this_venue):
    """
    Extract the scratched horses for one venue from the HTML of its Scratchings page.

    :param this_data: HTML text of the Scratchings page
    :param this_venue: object with ``name`` and ``date`` attributes; both are copied into every
        Scratching that is returned
    :return: list of Scratching namedtuples (venue, date, race, horse), or None when the page has no
        'div.scratchings' element or no 'h3' heading containing the word 'Scratchings'
    :raises AssertionError: when race numbers do not strictly increase, or when the number of parsed
        scratchings differs from the count read from the summary table
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    try:
        this_scr = this_soup.select('div.scratchings')[0]
    except IndexError:
        return None

    # Third row, fourth cell of the first table is read as the expected number of scratchings
    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
    print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))

    # select() only understands CSS selectors, so the heading is matched on its text explicitly
    header = next((h3 for h3 in this_scr.find_all('h3') if 'Scratchings' in h3.get_text()), None)
    if header is None:
        return None
    table = header.findNext('table')
    # A missing table is treated as "no lines" so that the count check below still runs
    lines = table.stripped_strings if table is not None else ()

    race_header = re.compile('^Race ([0-9]+):$')
    race = 0
    scratchings = []
    for text in lines:
        # stripped_strings never yields an empty string, so text[0] is always available
        match = race_header.search(text)
        if match:
            new_race = int(match.group(1))
            # Raised explicitly (not via assert) so the check survives "python -O"
            if new_race <= race:
                raise AssertionError('race {} ! > old_race {}'.format(new_race, race))
            race = new_race
            continue
        if text.startswith(('(', 'There are ')):
            # Parenthesised remarks and 'There are ...' summary lines are not horses
            continue
        if not text[0].isdecimal():
            # Horse entries are expected to start with a digit; report anything else and skip it
            print('First character in line: {}'.format(text[0]))
            print('The start of the offending line is: {}'.format(text[0:10]))
            continue
        scratchings.append(Scratching(this_venue.name, this_venue.date, race, text))

    if len(scratchings) != int(scratchings_count):
        raise AssertionError('len(scratchings) {} == scratchings_count {}'.format(
            len(scratchings), scratchings_count))
    pprint(scratchings)
    return scratchings
|