| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324 |
- from bs4 import BeautifulSoup
- import re
- import datetime
- from pytz import timezone
- import model
- import collections
- from string import capwords
- # import pytz
- from pprint import pprint
- import sys
- import arrow
- """
- This module contains custom methods based on bs4.beautifulsoup to analyze data
- """
# Base URL for all Racing Australia "FreeFields" pages scraped below.
base_url = 'https://racingaustralia.horse/FreeFields/'
# A race venue: Australian state code plus venue name.
Venue = collections.namedtuple('Venue', 'state, name')
# Venue extended with the raw date text, its parsed date and the Scratchings page URL.
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
# RaceDayShort extended with "last published" / "close" timestamps scraped from
# the Scratchings page, each as a display string plus a unixtime (see get_meta_data).
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
# One scratched horse as scraped from the page, before timing info is attached.
RawScratching = collections.namedtuple('RawScratching', 'venue state date race horse_no horse_display_name')
# A scratching with local time and UTC added.
# NOTE(review): the meaning of the 'torn' field is not evident from this module — confirm with callers.
Scratching = collections.namedtuple('Scratching', 'venue state date race time utc horse_no horse_display_name torn')
# One race row scraped from racenet (see get_racenet_races).
RacenetRaces = collections.namedtuple('RacenetRaces', 'race_date venue state race start_time utctime')
def get_today_row(this_text, this_row):
    """
    Traverses the main table on the front page of https://racingaustralia.horse.
    This function scrapes Venue information and race day information.
    Unfortunately there is no clever way to split this function into two parts.
    :param this_text: raw HTML of the front page
    :param this_row: index of the table row (day) to scrape, or -1 for all rows
    :return: list of RaceDayShort namedtuples
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    all_race_days = []
    days_to_check = [this_row]
    if this_row == -1:
        days_to_check = range(len(rows))
    # Column order of the state columns (column 0 holds the date).
    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
    for row_index in days_to_check:
        my_row = rows[row_index]
        cells = my_row.select('td')
        date_string = 'Unknown'
        # Formerly the loop variable `day` was reused for both the row index
        # and the date string; enumerate() replaces the manual `i` counter.
        for i, cell in enumerate(cells):
            if i == 0:
                # First cell contains date information
                date_string = cell.find('span').getText()
                continue
            venue_text = cell.find('p').getText().strip()
            if venue_text:
                # There may be several links in a cell (which represents a state)
                for a in cell.findAll('a'):
                    venue_name = a.getText().strip()
                    this_venue = Venue(states[i - 1], venue_name)
                    this_url = a.get('href')
                    if this_url:
                        # Create the Scratchings URL by substitution
                        scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                        scratchings_url = base_url + scratchings_url
                        calculated_date = model.convert_to_date(date_string)
                        this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
                                                     calculated_date, scratchings_url)
                        all_race_days.append(this_race_day)
    return all_race_days
def get_meta_data(this_data, this_venue):
    """
    Meta data is on the top-right of the Scratchings page. It contains a date and time for
    the latest update as well as the closing of reporting of Scratchings.
    This function scrapes both dateTimes and converts to unixtime (which is timezone unaware).
    The RaceDay namedTuple is accordingly extended.
    :param this_data: raw HTML of a Scratchings page
    :param this_venue: a RaceDayShort namedtuple for this meeting
    :return: a RaceDay namedtuple, or None when the page has no usable meta data
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    # BUG FIX: select() returns a ResultSet (a list), which has no get_text(),
    # so the old `early.get_text()` raised AttributeError whenever the banner
    # existed. select_one() returns the first matching Tag or None.
    early = this_soup.select_one('div.large')
    if early and 'not currently available' in early.get_text():
        # The page explicitly says scratchings are not published yet.
        return
    try:
        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
    except IndexError:
        return
    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
    close_regex = re.compile('Scratching close: (.+? AEST)')
    # The times list is filled with a dateTime string then a unixtime (seconds since 1970)
    times = ['', 0, '', 0]
    time_format = '%a %d-%b-%y %I:%M%p'
    # Brisbane observes no daylight saving, so it is a stable proxy for AEST.
    aest = timezone('Australia/Brisbane')
    if this_meta_data:
        this_meta_data = this_meta_data.getText()
        match = last_published_regex.search(this_meta_data)
        if match:
            # Strip the trailing ' AEST' (5 characters) before parsing.
            times[0] = match.group(1)[:-5]
            l_time = datetime.datetime.strptime(times[0], time_format)
            times[1] = model.convert_to_unixtime(aest.localize(l_time))
        match = close_regex.search(this_meta_data)
        if match:
            times[2] = match.group(1)[:-5]
            l_time = datetime.datetime.strptime(times[2], time_format)
            times[3] = model.convert_to_unixtime(aest.localize(l_time))
    # The RaceDay namedTuple is created and filled
    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
                       this_venue.date, this_venue.scratchings_url,
                       times[0], times[1], times[2], times[3])
    return race_day
def scrape_scratchings(div, this_venue):
    """
    Walks the text of a scratchings table and extracts one RawScratching per horse.
    :param div: a bs4 Tag — the <table> following a 'Scratchings' heading
    :param this_venue: namedtuple providing .name, .state and .date
    :return: list of RawScratching namedtuples
    """
    old_race = 0
    race = 0
    scraped_scratchings = []
    for text in div.stripped_strings:
        if text[:5] == 'Race ':
            # Header lines such as 'Race 3:' set the current race number.
            match = re.search('^Race ([0-9]+):$', text)
            if match:
                try:
                    race = int(match.group(1))
                except ValueError:
                    # This will happily fail in the next assert
                    race = 0
                # Race headers must appear in strictly increasing order.
                assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
                old_race = race
            continue
        if text[0] == '(':
            # Parenthesised annotation lines are noise.
            continue
        if len(text) > 0:
            if text[0:10] == 'There are ':
                # 'There are no scratchings ...' placeholder line.
                continue
            try:
                int(text[0])
            except ValueError:
                # Scratching rows must start with the horse number; skip others.
                print('First character in line: {}'.format(text[0]))
                print('The start of the offending line is: {}'.format(text[0:10]))
                continue
            # Horse number (optionally suffixed 'e' for emergency) then the name.
            match = re.search(r'^(\d{1,2})e?\s+(.+)', text)
            no = 0
            name = ''
            if match:
                no = int(match.group(1))
                name = capwords(match.group(2))
                name = re.sub(r' Of ', ' of ', name)
                # BUG FIX: the old code tested endswith('(nz)') (4 chars) but
                # sliced off len(' (nz)') == 5 chars, chopping a letter off any
                # name that ends in '(nz)' without a preceding space.
                if name.endswith(' (nz)'):
                    name = name[:-len(' (nz)')]
            temp_list = RawScratching(this_venue.name, this_venue.state, this_venue.date, race, no, name)
            scraped_scratchings.append(temp_list)
    return scraped_scratchings
def process_scratchings(this_data, this_venue):
    """
    Scrapes both the 'Scratchings' and 'Late Scratchings' tables of a page and
    cross-checks the combined count against the page's own summary cell.
    :param this_data: raw HTML of a Scratchings page
    :param this_venue: namedtuple providing .name, .state and .date
    :return: set of RawScratching namedtuples, or None when the page has none
    """
    soup = BeautifulSoup(this_data, 'html.parser')
    try:
        scratchings_div = soup.select('div.scratchings')[0]
    except IndexError:
        return
    # The summary table's third row, fourth cell holds the advertised count.
    summary = scratchings_div.select('table')[0]
    scratchings_count = summary.select('tr')[2].select('td')[3].getText()
    collected = set()
    # Harvest the table under each of the two headings in turn.
    for heading in ('Scratchings', 'Late Scratchings'):
        header = scratchings_div.findAll('h3', text=re.compile(heading))[0]
        collected.update(scrape_scratchings(header.findNext('table'), this_venue))
    # Sanity check: we found exactly as many scratchings as the page claims.
    assert len(collected) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
        len(collected), scratchings_count)
    return collected
def get_racenet_json(html):
    """
    Pulls the JSON assigned to `window.initialReduxState` out of a racenet page.
    :param html: raw HTML of the page
    :return: the JSON text, or the string '{}' when it cannot be found
    """
    pattern = re.compile(r'window\.initialReduxState = (.*)')
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', text=pattern)
    # Guard clauses replace the nested if/else of the original.
    if not script:
        print('Failing in {}'.format("'script'"))
        return '{}'
    match = pattern.search(script.text)
    if not match:
        print('Failing in {}'.format("'match'"))
        return '{}'
    return match.group(1)
def get_racenet_races(html):
    """
    Analyzes the html from the races page and scrapes venue and race information.
    :param html: raw HTML of the racenet race-meetings page
    :return: list of RacenetRaces namedtuples (exits the process when no
             meeting tables are present at all)
    """
    discard_non_tab = True
    discard_barrier_trials = True
    this_soup = BeautifulSoup(html, 'html.parser')
    all_rows = []
    tables = this_soup.find_all('table', class_='table-race-meetings')
    venues = []
    date_text = ''
    venue_text = ''
    venue_state = ''
    date_parsed = None  # set once a section date header has been seen
    print('{} tables found'.format(len(tables)))
    # Raw strings: '\d' in a plain string is a deprecated escape sequence.
    regex_time = re.compile(r'(\d{2}:\d{2})')
    regex_venue_state = re.compile(r'([ \w]+) \(([A-Z]{2,3})\)$')
    if not tables:
        print('No `tables` found')
        sys.exit(1)
    for table in tables:
        body = None
        if table:
            # Tab panel 6 appears to hold meetings we do not want — skip them.
            tab_panel = table.find_previous('div', id='meetinglist_tab_6')
            if tab_panel:
                continue
            date_div = table.find_previous('div', class_='race-meetings-section-header')
            if date_div:
                venue_h2 = date_div.find('h2', class_='race-meetings-section-title')
                if venue_h2:
                    venue_text = venue_h2.getText()
                    if discard_barrier_trials and venue_text == 'Barrier Trials':
                        continue
                date_span = date_div.find('span', class_='race-meetings-section-date')
                if date_span:
                    date_text = date_span.getText()
                    date_parsed = arrow.get(date_text, 'dddd DD MMMM YYYY')
                    print('{} {} - {}'.format(date_parsed.date(), venue_text, date_text))
            body = table.find('tbody')
        else:
            print('No `table` found')
        if body:
            all_rows = body.find_all('tr')
        else:
            print('No `body` found')
            continue
        for row in all_rows:
            for td in row.find_all('td'):
                venue_selector = td.find('h3')
                if venue_selector:
                    # Venue cell: extract name and state, e.g. 'Rosehill (NSW)'.
                    venue_name = venue_selector.get_text().strip()
                    if discard_non_tab and 'Non-TAB' in venue_name:
                        continue
                    venue_name = re.sub('\nNon-TAB', '', venue_name).strip()
                    venue_match = regex_venue_state.search(venue_name)
                    if venue_match:
                        venue_name = venue_match.group(1)
                        venue_state = venue_match.group(2)
                        if venue_state == 'NZ':
                            venue_state = 'NZL'
                elif td.get('class') and 'table-race-meeting-detail' in td.get('class'):
                    # Detail cell: one race with its start time.
                    time_string = td.find('span', class_='table-race-meeting-detail-info').getText()
                    time_match = regex_time.search(time_string)
                    if time_match:
                        time_string = time_match.group(1)
                    if time_string == 'TBA':
                        continue
                    # BUG FIX: the old code tested `if race_number:` BEFORE the
                    # attribute was ever assigned, raising NameError on the
                    # first detail cell. Read the attribute first, then test it.
                    race_number = td.get('data-race-number')
                    if race_number:
                        race_number = race_number[1:]  # drop the leading 'R'
                    else:
                        continue
                    start_time = td.get('data-start-time')
                    local_time = arrow.get(date_parsed.format('YYYY-MM-DD') + ' ' + time_string,
                                           'YYYY-MM-DD HH:mm').time()
                    print(start_time)
                    # data-start-time is epoch milliseconds.
                    utc_time = arrow.get(int(start_time) / 1000).datetime
                    # All data is collected so we can populate the namedTuple
                    racenet_race = RacenetRaces(date_parsed.date(), venue_name, venue_state, race_number,
                                                local_time, utc_time)
                    venues.append(racenet_race)
    pprint(venues)
    print('{} venues found'.format(len(venues)))
    return venues
|