"""
This module contains custom methods based on bs4.BeautifulSoup to analyze data
scraped from https://racingaustralia.horse.
"""
import collections
import re

from bs4 import BeautifulSoup

import model

base_url = 'https://racingaustralia.horse/FreeFields/'

# State assigned to each cell that follows the date cell of a row, in order.
_STATES = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')

# Matches from the first "/" up to the last ".aspx" of a venue link.
_ASPX_PAGE_RE = re.compile(r"/(.*)\.aspx")

# Matches a race title line and captures the race number and the time.
_RACE_TITLE_RE = re.compile(r'^Race (\d+) - (\d{1,2}:\d{2}[AP]M) ')

Venue = collections.namedtuple('Venue', 'state, name')
RaceDayShort = collections.namedtuple(
    'RaceDayShort', Venue._fields + ('date_string', 'date', 'program_url'))
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
Scratching = collections.namedtuple('Scratching', 'venue state date race horse')


def _program_url(href):
    """Build the absolute RaceProgram URL from the href of a venue link."""
    # Everything from the first "/" up to the last ".aspx" is replaced by the
    # RaceProgram page name; whatever follows (e.g. a query string) is kept.
    return base_url + _ASPX_PAGE_RE.sub('RaceProgram.aspx', href)


def _race_days_in_row(row):
    """Return the RaceDayShort entries found in one ``tr.rows`` table row."""
    cells = row.select('td')
    if not cells:
        return []

    # First cell contains date information. Keep the 'Unknown' placeholder
    # when the span is missing instead of failing on None.
    date_span = cells[0].find('span')
    date_string = date_span.get_text() if date_span is not None else 'Unknown'

    race_days = []
    # zip() stops at the shorter input, so cells beyond the known states are
    # ignored instead of raising IndexError.
    for state, cell in zip(_STATES, cells[1:]):
        paragraph = cell.find('p')
        if paragraph is None or not paragraph.get_text().strip():
            continue  # Cell is empty.
        # There may be several links in a cell (which represents a state).
        for link in cell.find_all('a'):
            href = link.get('href')
            if not href:
                continue  # A link without a target cannot yield a program URL.
            race_days.append(RaceDayShort(
                state,
                link.get_text().strip(),
                date_string,
                model.convert_to_date(date_string),
                _program_url(href),
            ))
    return race_days


def get_today_row(this_text, this_row):
    """
    Traverses the main table on the front page of https://racingaustralia.horse.

    This function scrapes Venue information and race day information. In each
    ``tr.rows`` element the first cell holds the date and every following cell
    holds the venue links of one state (in the order of ``_STATES``).

    :param this_text: HTML text containing the table.
    :param this_row: Index of the row to scrape, or -1 to scrape every row.
        Any other value is used directly as an index into the list of rows.
    :return: list of RaceDayShort, one per venue link that carries an href.
    :raises IndexError: if this_row does not address an existing row.
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    row_indexes = range(len(rows)) if this_row == -1 else [this_row]

    all_race_days = []
    for row_index in row_indexes:
        all_race_days.extend(_race_days_in_row(rows[row_index]))
    return all_race_days


def separate_races(program_html):
    """
    Get the description line for each of the races from the html.

    :param program_html: HTML text containing ``table.race-title`` elements.
    :return: list of ``(race_number, time)`` string tuples such as
        ``('3', '1:45PM')``, in document order. Titles that do not start with
        ``Race <number> - <time> `` are skipped.
    """
    this_soup = BeautifulSoup(program_html, 'html.parser')
    races = []
    for table in this_soup.select('table.race-title'):
        for title in table.select('a.race-title-anchor-3'):
            race_match = _RACE_TITLE_RE.search(title.get_text())
            if race_match:
                races.append(race_match.groups())
    return races