|
|
@@ -6,7 +6,9 @@ import model
|
|
|
import collections
|
|
|
from string import capwords
|
|
|
# import pytz
|
|
|
-# from pprint import pprint
|
|
|
+from pprint import pprint
|
|
|
+import sys
|
|
|
+import arrow
|
|
|
|
|
|
"""
|
|
|
This module contains custom methods based on bs4.beautifulsoup to analyze data
|
|
|
@@ -21,6 +23,7 @@ RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
|
|
|
'scratchings_close_datetime', 'scratchings_close_unixtime'))
|
|
|
RawScratching = collections.namedtuple('RawScratching', 'venue state date race horse_no horse_display_name')
|
|
|
Scratching = collections.namedtuple('Scratching', 'venue state date race time utc horse_no horse_display_name torn')
|
|
|
+RacenetRaces = collections.namedtuple('RacenetRaces', 'date venue_name state race_no local_time unix_time')
|
|
|
|
|
|
|
|
|
def get_today_row(this_text, this_row):
|
|
|
@@ -219,3 +222,98 @@ def get_racenet_json(html):
|
|
|
print('Failing in {}'.format("'script'"))
|
|
|
# pprint(json)
|
|
|
return json
|
|
|
+
|
|
|
+
|
|
|
def get_racenet_races(html):
    """
    Analyzes the html from the races page and scrapes venue and race information

    Every ``table`` with class ``table-race-meetings`` is visited. A table is
    skipped when a ``div#meetinglist_tab_6`` precedes it in the document, or
    when its section header title is exactly ``Barrier Trials``. The date is
    read from the nearest preceding ``race-meetings-section-header`` div.

    Inside a table, a ``td`` holding an ``h3`` starts a new venue, and a ``td``
    with class ``table-race-meeting-detail`` describes one race of the venue
    seen most recently. Venues labelled ``Non-TAB`` are dropped together with
    all of their race cells.

    The collected records are pretty-printed, counted on stdout, and returned.

    :param html: markup of the races page, handed as-is to BeautifulSoup
    :return: list of ``RacenetRaces`` tuples, one per race cell that was kept
    :raises SystemExit: exit status 1 when the page has no meetings table
    """
    discard_non_tab = True
    discard_barrier_trials = True

    # Raw strings keep `\d`, `\w` and `\(` as regex escapes rather than
    # (invalid) Python string escapes.
    regex_time = re.compile(r'(\d{2}:\d{2})')
    regex_venue_state = re.compile(r'([ \w]+) \(([A-Z]{2,3})\)$')

    this_soup = BeautifulSoup(html, 'html.parser')
    tables = this_soup.find_all('table', class_='table-race-meetings')
    print('{} tables found'.format(len(tables)))
    if not tables:
        print('No `tables` found')
        sys.exit(1)

    venues = []
    date_text = ''
    venue_text = ''
    venue_name = ''
    venue_state = ''
    # Date and venue carry over between tables/cells, as before; they start
    # out defined so a page with an unexpected layout cannot raise NameError.
    date_parsed = None
    skip_venue = False

    for table in tables:
        # Any table located after this tab panel is ignored.
        # NOTE(review): presumably this is the trials tab - confirm against the page.
        if table.find_previous('div', id='meetinglist_tab_6') is not None:
            continue

        date_div = table.find_previous('div', class_='race-meetings-section-header')
        if date_div is not None:
            venue_h2 = date_div.find('h2', class_='race-meetings-section-title')
            if venue_h2 is not None:
                venue_text = venue_h2.getText()
                if discard_barrier_trials and venue_text == 'Barrier Trials':
                    continue
            date_span = date_div.find('span', class_='race-meetings-section-date')
            if date_span is not None:
                date_text = date_span.getText()
                date_parsed = arrow.get(date_text, 'dddd DD MMMM YYYY')
                print('{} {} - {}'.format(date_parsed.date(), venue_text, date_text))

        if date_parsed is None:
            # Without a date no record can be built for this table.
            print('No `date` found')
            continue

        body = table.find('tbody')
        if body is None:
            print('No `body` found')
            continue

        for row in body.find_all('tr'):
            for td in row.find_all('td'):
                venue_heading = td.find('h3')
                if venue_heading is not None:
                    venue_name = venue_heading.get_text().strip()
                    # Remember the decision so that the race cells that follow
                    # this heading are discarded too, not just the heading.
                    skip_venue = discard_non_tab and 'Non-TAB' in venue_name
                    if skip_venue:
                        continue
                    venue_name = re.sub('\nNon-TAB', '', venue_name).strip()
                    venue_match = regex_venue_state.search(venue_name)
                    if venue_match:
                        venue_name = venue_match.group(1)
                        venue_state = venue_match.group(2)
                        if venue_state == 'NZ':
                            venue_state = 'NZL'
                    # NOTE(review): when the heading has no "(STATE)" suffix the
                    # state of the previous venue is kept - confirm this is wanted.
                    continue

                if skip_venue:
                    continue
                if 'table-race-meeting-detail' not in (td.get('class') or []):
                    continue

                info_span = td.find('span', class_='table-race-meeting-detail-info')
                race_attr = td.get('data-race-number')
                if info_span is None or race_attr is None:
                    # Incomplete race cell: nothing reliable to record.
                    continue

                time_string = info_span.getText()
                time_match = regex_time.search(time_string)
                if time_match:
                    time_string = time_match.group(1)
                if time_string == 'TBA':
                    continue

                # The first character of the attribute is dropped
                # (presumably an "R" prefix - TODO confirm).
                race_number = race_attr[1:]
                start_time = td.get('data-start-time')
                # All data is collected so we can populate the namedTuple
                venues.append(RacenetRaces(date_parsed.date(), venue_name, venue_state,
                                           race_number, time_string, start_time))

    pprint(venues)
    print('{} venues found'.format(len(venues)))
    return venues
|