# _bs.py
  1. from bs4 import BeautifulSoup
  2. import re
  3. # import datetime
  4. # from pytz import timezone
  5. import model
  6. import collections
  7. # # import pytz
  8. # from pprint import pprint
  9. """
  10. This module contains custom methods based on bs4.beautifulsoup to analyze data
  11. """
  12. base_url = 'https://racingaustralia.horse/FreeFields/'
  13. Venue = collections.namedtuple('Venue', 'state, name')
  14. RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'program_url'))
  15. # noinspection PyProtectedMember,PyUnresolvedReferences
  16. RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
  17. 'scratchings_latest_datetime', 'scratchings_latest_unixtime',
  18. 'scratchings_close_datetime', 'scratchings_close_unixtime'))
  19. Scratching = collections.namedtuple('Scratching', 'venue state date race horse')
  20. def get_today_row(this_text, this_row):
  21. """
  22. Traverses the main table on the front page of https://racingaustralia.horse.
  23. This function scrapes Venue information and race day information.
  24. Unfortunately there is no clever way to split this function into two parts.
  25. :param this_text:
  26. :param this_row:
  27. :return RaceDay this_race_day:
  28. """
  29. this_soup = BeautifulSoup(this_text, 'html.parser')
  30. rows = this_soup.select('tr.rows')
  31. # print('len(rows) {}'.format(len(rows)))
  32. all_race_days = []
  33. days_to_check = [this_row]
  34. if this_row == -1:
  35. days_to_check = range(len(rows))
  36. for day in days_to_check:
  37. my_row = rows[day]
  38. cells = my_row.select('td')
  39. i = 0
  40. states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
  41. day = 'Unknown'
  42. for cell in cells:
  43. if i == 0:
  44. # First cell contains date information
  45. day = cell.find('span').getText()
  46. # print("date: {}".format(day))
  47. i += 1
  48. continue
  49. venue_text = cell.find('p').getText().strip()
  50. if len(venue_text) > 0:
  51. # Cell is not empty
  52. # print(venue_text)
  53. this_a = cell.findAll('a') # .get('href')
  54. for a in this_a:
  55. # There may be several links in a cell (which represents a state)
  56. venue_name = a.getText().strip()
  57. this_venue = Venue(states[i - 1], venue_name)
  58. date_string = day
  59. this_url = a.get('href')
  60. if this_url:
  61. # Create the Scratchings URL by substitution
  62. program_url = re.sub(r"/(.*)\.aspx", 'RaceProgram.aspx', this_url)
  63. program_url = base_url + program_url
  64. calculated_date = model.convert_to_date(date_string)
  65. this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
  66. calculated_date, program_url)
  67. all_race_days.append(this_race_day)
  68. i += 1
  69. return all_race_days
  70. def separate_races(program_html):
  71. """
  72. Get the description line for each of the races from the html
  73. :rtype: object
  74. :param program_html:
  75. :return:
  76. """
  77. this_soup = BeautifulSoup(program_html, 'html.parser')
  78. table_blocks = this_soup.select('table.race-title')
  79. # print(len(table_blocks))
  80. races = []
  81. for table in table_blocks:
  82. titles = table.select('a.race-title-anchor-3')
  83. for title in titles:
  84. this_line = title.getText()
  85. race_match = re.search(r'^Race (\d+) - (\d{1,2}:\d{2}[AP]M) ', this_line)
  86. if race_match:
  87. races.append((race_match.group(1), race_match.group(2)))
  88. return races