# _bs.py

from bs4 import BeautifulSoup
import re
import datetime
from pytz import timezone
import model
import collections
# import pytz

"""
This module contains custom methods based on bs4.BeautifulSoup to analyze
scraped racingaustralia.horse pages.
"""

base_url = 'https://racingaustralia.horse/FreeFields/'

Venue = collections.namedtuple('Venue', 'state, name')
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
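
# For illustration only (hypothetical values, not scraped from a live page):
# get_today_row() yields RaceDayShort records shaped like
#     RaceDayShort(state='NSW', name='Randwick',
#                  date_string='Saturday 22 June 2019',
#                  scratchings_url='https://racingaustralia.horse/FreeFields/Scratchings.aspx?...')
# and get_meta_data() widens such a record into a RaceDay carrying the four
# scratchings timestamp fields defined above.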


def get_today_row(this_text):
    """
    Traverses the main table on the front page of https://racingaustralia.horse.
    This function scrapes venue information and race day information.
    Unfortunately there is no clever way to split this function into two parts.
    :param this_text:
    :return list of RaceDayShort all_race_days:
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    my_row = rows[2]
    cells = my_row.select('td')
    i = 0
    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
    all_race_days = []
    day = 'Unknown'
    for cell in cells:
        if i == 0:
            # First cell contains date information
            day = cell.find('span').getText()
            # print("date: {}".format(day))
            i += 1
            continue
        venue_text = cell.find('p').getText().strip()
        if len(venue_text) > 0:
            # Cell is not empty
            print(venue_text)
            this_a = cell.findAll('a')  # .get('href')
            for a in this_a:
                # There may be several links in a cell (which represents a state)
                venue_name = a.getText().strip()
                this_venue = Venue(states[i - 1], venue_name)
                date_string = day
                this_url = a.get('href')
                if this_url:
                    # Create the Scratchings URL by substitution
                    scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                    scratchings_url = base_url + scratchings_url
                    this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
                    all_race_days.append(this_race_day)
        i += 1
    return all_race_days
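
# Usage sketch for get_today_row (comment only; `requests` is an assumed
# dependency here and is not imported by this module):
#
#     import requests
#     front_page = requests.get('https://racingaustralia.horse/')
#     for short_day in get_today_row(front_page.text):
#         print(short_day.state, short_day.name, short_day.scratchings_url)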


def get_meta_data(this_data, this_venue):
    """
    Meta data is on the top-right of the Scratchings page. It contains a date and time for
    the latest update as well as for the close of scratchings reporting.
    This function scrapes both datetimes and converts them to unixtime (which is timezone unaware).
    The RaceDay namedtuple is extended accordingly.
    :param this_data:
    :param this_venue:
    :return RaceDay race_day:
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
    close_regex = re.compile('Scratching close: (.+? AEST)')
    times = ['', 0, '', 0]
    time_format = '%a %d-%b-%y %I:%M%p'
    # Brisbane is AEST all year round (Queensland has no daylight saving)
    aest = timezone('Australia/Brisbane')
    if this_meta_data:
        this_meta_data = this_meta_data.getText()
        match = last_published_regex.search(this_meta_data)
        if match:
            # print(match[1])
            # Drop the trailing ' AEST' suffix so strptime can parse the timestamp
            times[0] = match[1][:-5]
            # times[0] = 'Thu 20-Jun-19 7:42AM'
            l_time = datetime.datetime.strptime(times[0], time_format)
            # print(aest.localize(l_time))
            times[1] = model.convert_to_unixtime(aest.localize(l_time))
            # print(times[1])
        match = close_regex.search(this_meta_data)
        if match:
            # print(match[1])
            times[2] = match[1][:-5]
            l_time = datetime.datetime.strptime(times[2], time_format)
            # print(aest.localize(l_time))
            times[3] = model.convert_to_unixtime(aest.localize(l_time))
            # print(times[3])
    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
                       times[0], times[1], times[2], times[3])
    return race_day
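

# End-to-end sketch tying both scrapers together. This is illustrative only:
# it assumes the `requests` package is installed and racingaustralia.horse is
# reachable, and it is not part of the original scraping logic above.
if __name__ == '__main__':
    import requests

    front_page = requests.get('https://racingaustralia.horse/')
    for short_day in get_today_row(front_page.text):
        scratchings_page = requests.get(short_day.scratchings_url)
        full_day = get_meta_data(scratchings_page.text, short_day)
        print(full_day.name, full_day.scratchings_latest_datetime,
              full_day.scratchings_close_datetime)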