"""
This module contains custom methods based on bs4.beautifulsoup to analyze data
"""
from bs4 import BeautifulSoup
import re
import datetime
from pytz import timezone
import model
import collections
from pprint import pprint

base_url = 'https://racingaustralia.horse/FreeFields/'

Venue = collections.namedtuple('Venue', 'state, name')
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
Scratching = collections.namedtuple('Scratching', 'venue date race horse')

# Column order of the state cells that follow the date cell in each front-page row.
_STATES = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')

# Patterns for the two timestamps in the Scratchings page meta data.
# Only timestamps that carry the literal suffix " AEST" are recognised.
_LAST_PUBLISHED_REGEX = re.compile('Scratchings Last Published: (.+? AEST)')
_CLOSE_REGEX = re.compile('Scratching close: (.+? AEST)')
# Example of a matching timestamp (without the suffix): 'Thu 20-Jun-19 7:42AM'
_TIME_FORMAT = '%a %d-%b-%y %I:%M%p'

_RACE_HEADING_REGEX = re.compile('^Race ([0-9]+):$')
_SCRATCHINGS_HEADER_REGEX = re.compile('Scratchings')


def get_today_row(this_text):
    """
    Traverses the main table on the front page of https://racingaustralia.horse.
    This function scrapes Venue information and race day information.

    For every ``tr.rows`` row the first cell supplies the date text and each
    following cell is treated as one state (in the order of ``_STATES``).
    Every link with an ``href`` inside a non-empty state cell yields one entry.

    :param this_text: HTML of the front page
    :return: list of RaceDayShort; the ``date`` field is always the placeholder
        '1970-01-01' and ``scratchings_url`` is derived from the link's href
    :raises IndexError: if a non-empty cell lies beyond the known states
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    all_race_days = []
    for my_row in this_soup.select('tr.rows'):
        date_string = 'Unknown'
        for i, cell in enumerate(my_row.select('td')):
            if i == 0:
                # First cell contains date information
                span = cell.find('span')
                if span is not None:
                    date_string = span.get_text()
                continue
            paragraph = cell.find('p')
            if paragraph is None:
                # No venue paragraph at all: treat like an empty cell
                continue
            venue_text = paragraph.get_text().strip()
            if not venue_text:
                continue
            print(venue_text)
            # There may be several links in a cell (which represents a state)
            for a in cell.find_all('a'):
                this_url = a.get('href')
                if not this_url:
                    continue
                this_venue = Venue(_STATES[i - 1], a.get_text().strip())
                # Create the Scratchings URL by substitution: everything from the
                # first '/' up to and including '.aspx' becomes 'Scratchings.aspx'
                scratchings_url = base_url + re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                all_race_days.append(
                    RaceDayShort(this_venue.state, this_venue.name, date_string, '1970-01-01', scratchings_url))
    return all_race_days


def _parse_timestamp(pattern, text, tz):
    """
    Find one timestamp in the meta data text.

    :param pattern: compiled regex whose group 1 is the timestamp including ' AEST'
    :param text: meta data text to search
    :param tz: pytz timezone used to localize the parsed naive datetime
    :return: (dateTime string without the suffix, unixtime) or ('', 0) if not found
    :raises ValueError: if the matched text does not follow ``_TIME_FORMAT``
    """
    match = pattern.search(text)
    if not match:
        return '', 0
    # Drop the trailing ' AEST' (5 characters) before parsing
    time_string = match.group(1)[:-5]
    l_time = datetime.datetime.strptime(time_string, _TIME_FORMAT)
    return time_string, model.convert_to_unixtime(tz.localize(l_time))


def get_meta_data(this_data, this_venue):
    """
    Meta data is on the top-right of the Scratchings page. It contains a date and time for the latest update
    as well as the closing of reporting of Scratchings.
    This function scrapes both dateTimes and converts them to unixtime via ``model.convert_to_unixtime``.
    The RaceDay namedTuple is accordingly extended.

    :param this_data: HTML of a Scratchings page
    :param this_venue: RaceDayShort-like object providing ``state``, ``name``,
        ``date_string`` and ``scratchings_url``
    :return: RaceDay, or None when the page reports that scratchings are
        'not currently available' or the meta data block is missing
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')

    # select_one() yields a single Tag (or None); select() would yield a list,
    # which has no get_text().
    early = this_soup.select_one('div.large')
    if early is not None:
        early_text = early.get_text()
        print(early_text)
        if 'not currently available' in early_text:
            return None

    try:
        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
    except IndexError:
        return None

    meta_text = this_meta_data.get_text()
    aest = timezone('Australia/Brisbane')
    latest_string, latest_unixtime = _parse_timestamp(_LAST_PUBLISHED_REGEX, meta_text, aest)
    close_string, close_unixtime = _parse_timestamp(_CLOSE_REGEX, meta_text, aest)

    # The RaceDay namedTuple is created and filled. The date is taken from the
    # closing time plus 12 hours, converted with the machine's local timezone.
    return RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
                   datetime.date.fromtimestamp(close_unixtime + 12 * 60 * 60),
                   this_venue.scratchings_url,
                   latest_string, latest_unixtime, close_string, close_unixtime)


def process_scratchings(this_data, this_venue):
    """
    Extracts the scratched horses from a Scratchings page.

    The expected number of scratchings is read from the fourth cell of the third
    row of the first table inside ``div.scratchings``. The entries themselves are
    read from the table following the 'Scratchings' header: 'Race N:' lines set
    the current race, lines starting with '(' or 'There are ' are skipped, and
    every remaining line that starts with a digit is recorded.

    :param this_data: HTML of a Scratchings page
    :param this_venue: object providing ``name`` and ``date``
    :return: list of Scratching, or None if the page has no ``div.scratchings``
    :raises AssertionError: if race numbers do not strictly increase or the
        number of collected entries differs from the count stated on the page
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    try:
        this_scr = this_soup.select('div.scratchings')[0]
    except IndexError:
        return None

    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].get_text()
    print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))

    # select() cannot filter on text, so filter explicitly. Fall back to the
    # first h3 when none mentions 'Scratchings'.
    headers = this_scr.select('h3')
    matching = [h for h in headers if _SCRATCHINGS_HEADER_REGEX.search(h.get_text())]
    header = (matching or headers)[0]
    table = header.find_next('table')

    old_race = 0
    race = 0
    scratchings = []
    # stripped_strings never yields empty strings, so text[0] is always valid.
    for text in table.stripped_strings:
        if text.startswith('Race '):
            match = _RACE_HEADING_REGEX.search(text)
            if match:
                race = int(match.group(1))
            # A malformed heading leaves race unchanged and fails here.
            assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
            old_race = race
            continue
        if text.startswith('(') or text.startswith('There are '):
            continue
        # Entries are expected to start with a digit; report anything else.
        if not text[0].isdecimal():
            print('First character in line: {}'.format(text[0]))
            print('The start of the offending line is: {}'.format(text[0:10]))
            continue
        scratchings.append(Scratching(this_venue.name, this_venue.date, race, text))

    assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
        len(scratchings), scratchings_count)
    pprint(scratchings)
    return scratchings