"""
_bs.py — custom helpers built on bs4.BeautifulSoup to scrape and analyse racing data.
"""
from bs4 import BeautifulSoup
import re
import datetime
from pytz import timezone
import model
import collections
from string import capwords
from pprint import pprint
import sys
import arrow

base_url = 'https://racingaustralia.horse/FreeFields/'

Venue = collections.namedtuple('Venue', 'state, name')
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
# noinspection PyProtectedMember,PyUnresolvedExtensions
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
RawScratching = collections.namedtuple('RawScratching', 'venue state date race horse_no horse_display_name')
Scratching = collections.namedtuple('Scratching', 'venue state date race time utc horse_no horse_display_name torn')
RacenetRaces = collections.namedtuple('RacenetRaces', 'race_date venue state race start_time utctime')
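

# Illustration (hedged): RaceDayShort extends Venue, and RaceDay extends
# RaceDayShort, so each tuple can be built by unpacking the previous one.
# Every field value below is made up for the example, including the epoch numbers.
def _example_tuple_nesting():
    venue = Venue(state='NSW', name='Randwick')
    short = RaceDayShort(*venue, date_string='Sat 22-Jun-19',
                         date=datetime.date(2019, 6, 22),
                         scratchings_url=base_url + 'Scratchings.aspx')
    return RaceDay(*short, 'Thu 20-Jun-19 7:42AM', 1560980520,
                   'Sat 22-Jun-19 7:30AM', 1561152600)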


def get_today_row(this_text, this_row):
    """
    Traverses the main table on the front page of https://racingaustralia.horse,
    scraping venue information and race-day information in a single pass
    (the table layout leaves no clean way to split this into two functions).
    :param this_text: HTML of the front page
    :param this_row: index of the table row to scrape, or -1 for every row
    :return: list of RaceDayShort namedtuples
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    # print('len(rows) {}'.format(len(rows)))
    all_race_days = []
    days_to_check = [this_row]
    if this_row == -1:
        days_to_check = range(len(rows))
    for day in days_to_check:
        my_row = rows[day]
        cells = my_row.select('td')
        i = 0
        states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
        date_text = 'Unknown'
        for cell in cells:
            if i == 0:
                # The first cell contains the date information
                date_text = cell.find('span').getText()
                i += 1
                continue
            venue_text = cell.find('p').getText().strip()
            if len(venue_text) > 0:
                # The cell is not empty; each cell represents a state and
                # may contain several venue links
                for a in cell.findAll('a'):
                    venue_name = a.getText().strip()
                    this_venue = Venue(states[i - 1], venue_name)
                    this_url = a.get('href')
                    if this_url:
                        # Build the Scratchings URL by substitution
                        scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                        scratchings_url = base_url + scratchings_url
                        calculated_date = model.convert_to_date(date_text)
                        this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_text,
                                                     calculated_date, scratchings_url)
                        all_race_days.append(this_race_day)
            i += 1
    return all_race_days
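

# Usage sketch (hedged): fetching the front page is outside this module; the
# `requests` dependency here is an assumption made for illustration.
def _example_get_today_row():
    import requests  # assumed to be available in the project environment
    response = requests.get('https://racingaustralia.horse')  # front page with the main table
    # this_row == -1 walks every row of the table instead of a single day
    for race_day in get_today_row(response.text, -1):
        print(race_day.state, race_day.name, race_day.scratchings_url)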


def get_meta_data(this_data, this_venue):
    """
    The metadata sits at the top-right of the Scratchings page: a date and time
    for the latest update, and another for the close of scratchings reporting.
    This function scrapes both datetimes and converts them to unixtime (which is
    timezone-unaware), extending the RaceDayShort namedtuple into a RaceDay.
    :param this_data: HTML of the Scratchings page
    :param this_venue: a RaceDayShort namedtuple
    :return: a RaceDay namedtuple, or None if the page is not available
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    early = this_soup.select_one('div.large')
    if early and 'not currently available' in early.get_text():
        return
    try:
        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
    except IndexError:
        return
    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
    close_regex = re.compile('Scratching close: (.+? AEST)')
    # The times list holds a datetime string followed by a unixtime
    # (seconds since 1970) for each of the two timestamps
    times = ['', 0, '', 0]
    time_format = '%a %d-%b-%y %I:%M%p'
    aest = timezone('Australia/Brisbane')
    if this_meta_data:
        this_meta_data = this_meta_data.getText()
        match = last_published_regex.search(this_meta_data)
        if match:
            # e.g. 'Thu 20-Jun-19 7:42AM'; the trailing ' AEST' is sliced off
            times[0] = match.group(1)[:-5]
            l_time = datetime.datetime.strptime(times[0], time_format)
            times[1] = model.convert_to_unixtime(aest.localize(l_time))
        match = close_regex.search(this_meta_data)
        if match:
            times[2] = match.group(1)[:-5]
            l_time = datetime.datetime.strptime(times[2], time_format)
            times[3] = model.convert_to_unixtime(aest.localize(l_time))
    # Create and fill the RaceDay namedtuple
    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
                       this_venue.date, this_venue.scratchings_url,
                       times[0], times[1], times[2], times[3])
    return race_day
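

# Sketch (hedged) of the timestamp handling above, assuming model.convert_to_unixtime()
# accepts a timezone-aware datetime and returns seconds since the epoch.
def _example_parse_meta_time():
    raw = 'Thu 20-Jun-19 7:42AM AEST'  # the shape scraped from the page
    trimmed = raw[:-5]                 # drop the trailing ' AEST', as above
    naive = datetime.datetime.strptime(trimmed, '%a %d-%b-%y %I:%M%p')
    aware = timezone('Australia/Brisbane').localize(naive)
    return model.convert_to_unixtime(aware)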


def scrape_scratchings(div, this_venue):
    """
    Walks the text of a scratchings table and collects one RawScratching
    namedtuple per scratched runner.
    :param div: the bs4 table element that follows a Scratchings header
    :param this_venue: a namedtuple carrying name, state and date fields
    :return: list of RawScratching namedtuples
    """
    old_race = 0
    race = 0
    scraped_scratchings = []
    for text in div.stripped_strings:
        if text[:5] == 'Race ':
            match = re.search('^Race ([0-9]+):$', text)
            if match:
                try:
                    race = int(match.group(1))
                except ValueError:
                    # This will deliberately fail in the next assert
                    race = 0
                assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
                old_race = race
            continue
        if text[0] == '(':
            continue
        if text[0:10] == 'There are ':
            continue
        try:
            int(text[0])
        except ValueError:
            print('First character in line: {}'.format(text[0]))
            print('The start of the offending line is: {}'.format(text[0:10]))
            continue
        match = re.search(r'^(\d{1,2})e?\s+(.+)', text)
        no = 0
        name = ''
        if match:
            no = int(match.group(1))
            name = capwords(match.group(2))
            name = re.sub(r' Of ', ' of ', name)
            if name.endswith('(nz)'):
                name = name[:-len(' (nz)')]
        temp_list = RawScratching(this_venue.name, this_venue.state, this_venue.date, race, no, name)
        scraped_scratchings.append(temp_list)
    return scraped_scratchings
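

# Illustration (hedged): the runner regex above, applied to a made-up line.
# The optional 'e' after the number appears to mark an emergency runner.
def _example_runner_regex():
    text = '12e OCEAN OF STORMS (NZ)'
    match = re.search(r'^(\d{1,2})e?\s+(.+)', text)
    no, name = int(match.group(1)), capwords(match.group(2))
    name = re.sub(r' Of ', ' of ', name)
    if name.endswith('(nz)'):
        name = name[:-len(' (nz)')]
    return no, name  # (12, 'Ocean of Storms')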


def process_scratchings(this_data, this_venue):
    """
    Scrapes the scratchings and late scratchings tables of a Scratchings page
    and cross-checks the result against the page's own scratchings count.
    :param this_data: HTML of the Scratchings page
    :param this_venue: a RaceDayShort namedtuple
    :return: set of RawScratching namedtuples, or None if the page has no data
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    try:
        this_scr = this_soup.select('div.scratchings')[0]
    except IndexError:
        return
    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
    # print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))
    header = this_scr.findAll('h3', text=re.compile('Scratchings'))[0]
    div = header.findNext('table')
    scratchings = set()
    early_scratchings = scrape_scratchings(div, this_venue)
    scratchings.update(early_scratchings)
    header = this_scr.findAll('h3', text=re.compile('Late Scratchings'))[0]
    late_div = header.findNext('table')
    late_scratchings = scrape_scratchings(late_div, this_venue)
    # if this_venue.name == 'Corowa':
    #     pprint(late_div)
    #     pprint(late_scratchings)
    scratchings.update(late_scratchings)
    assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
        len(scratchings), scratchings_count)
    return scratchings
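

# End-to-end sketch (hedged): tie get_today_row, get_meta_data and
# process_scratchings together for one race day. The `requests` dependency is
# an assumption, and all error handling is omitted.
def _example_scratchings_for_day(race_day_short):
    import requests  # assumed to be available in the project environment
    page = requests.get(race_day_short.scratchings_url).text
    race_day = get_meta_data(page, race_day_short)  # None if the page is not live yet
    scratchings = process_scratchings(page, race_day_short)
    return race_day, scratchings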


def get_racenet_json(html):
    """
    Extracts the inline `window.initialReduxState` JSON blob from a racenet page.
    :param html: HTML of the races page
    :return: the JSON string, or '{}' if it could not be found
    """
    this_soup = BeautifulSoup(html, 'html.parser')
    pattern = re.compile(r'window\.initialReduxState = (.*)')
    script = this_soup.find('script', text=pattern)
    json_string = '{}'
    if script:
        match = pattern.search(script.text)
        if match:
            json_string = match.group(1)
        else:
            print("Failing in 'match'")
    else:
        print("Failing in 'script'")
    return json_string
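

# Sketch (hedged): the extracted blob should be a JSON document, so it can be
# decoded with the standard library. The trailing-semicolon trim is an assumption
# about how the inline <script> statement is terminated.
def _example_decode_racenet_json(html):
    import json
    blob = get_racenet_json(html)
    if blob.endswith(';'):
        blob = blob[:-1]
    return json.loads(blob)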


def get_racenet_races(html):
    """
    Analyzes the HTML from the races page and scrapes venue and race information.
    :param html: HTML of the races page
    :return: list of RacenetRaces namedtuples
    """
    discard_non_tab = True
    discard_barrier_trials = True
    this_soup = BeautifulSoup(html, 'html.parser')
    tables = this_soup.find_all('table', class_='table-race-meetings')
    venues = []
    date_text = ''
    venue_text = ''
    venue_state = ''
    race_number = ''
    # Placeholder date; overwritten as soon as a section header is parsed
    date_parsed = arrow.get('Wednesday 14 August 2019', 'dddd DD MMMM YYYY')
    venue_name = ''
    print('{} tables found'.format(len(tables)))
    regex_time = re.compile(r'(\d{2}:\d{2})')
    regex_venue_state = re.compile(r'([ \w]+) \(([A-Z]{2,3})\)$')
    if tables:
        for table in tables:
            body = None
            if table:
                # if "table-race-meetings--trials" in table.attrs['class']:
                #     # print('This is a trial meeting')
                #     continue
                tab_panel = table.find_previous('div', id='meetinglist_tab_6')
                if tab_panel:
                    continue
                date_div = table.find_previous('div', class_='race-meetings-section-header')
                if date_div:
                    venue_h2 = date_div.find('h2', class_='race-meetings-section-title')
                    if venue_h2:
                        venue_text = venue_h2.getText()
                        if discard_barrier_trials and 'Barrier Trials' in venue_text:
                            print('Skipped `Barrier Trials` {}'.format(venue_text))
                            continue
                    date_span = date_div.find('span', class_='race-meetings-section-date')
                    if date_span:
                        date_text = date_span.getText()
                        date_parsed = arrow.get(date_text, 'dddd DD MMMM YYYY')
                        print('{} {} - {}'.format(date_parsed.date(), venue_text, date_text))
                body = table.find('tbody')
            else:
                print('No `table` found')
                # sys.exit(1)
            if body:
                all_rows = body.find_all('tr')
            else:
                print('No `body` found')
                continue
                # sys.exit(1)
            for row in all_rows:
                for td in row.find_all('td'):
                    venue_selector = td.find('h3')
                    if venue_selector:
                        venue_name = venue_selector.get_text().strip()
                        if discard_non_tab and 'Non-TAB' in venue_name:
                            print('Skipped Non-TAB {}'.format(venue_name))
                            continue
                        venue_name = re.sub('\nNon-TAB', '', venue_name).strip()
                        venue_match = regex_venue_state.search(venue_name)
                        if venue_match:
                            venue_name = venue_match.group(1)
                            venue_state = venue_match.group(2)
                            if venue_state == 'NZ':
                                venue_state = 'NZL'
                    elif td.get('class') and 'table-race-meeting-detail' in td.get('class'):
                        time_string = td.find('span', class_='table-race-meeting-detail-info').getText()
                        time_match = regex_time.search(time_string)
                        if time_match:
                            time_string = time_match.group(1)
                        if time_string == 'TBA':
                            continue
                        race_number_padded = td.get('data-race-number')
                        if race_number_padded:
                            # e.g. '01' -> '1'
                            race_number = race_number_padded[1:]
                        else:
                            continue
                        start_time = td.get('data-start-time')
                        time_string_with_date = date_parsed.format('YYYY-MM-DD') + ' ' + time_string
                        local_time = arrow.get(time_string_with_date, 'YYYY-MM-DD HH:mm').time()
                        # data-start-time is in milliseconds since the epoch
                        utc_time = arrow.get(int(start_time) / 1000).datetime
                        # All data is collected, so populate the namedtuple
                        racenet_race = RacenetRaces(date_parsed.date(), venue_name, venue_state,
                                                    race_number, local_time, utc_time)
                        venues.append(racenet_race)
    else:
        print('No `tables` found')
        sys.exit(1)
    pprint(venues)
    print('{} venues found'.format(len(venues)))
    return venues
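

# Smoke-test sketch (hedged): reading a saved copy of the page avoids hitting
# the live site. 'racenet.html' is a hypothetical local fixture, not a file
# this project is known to ship.
if __name__ == '__main__':
    with open('racenet.html', encoding='utf-8') as fh:
        get_racenet_races(fh.read())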