| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
- from bs4 import BeautifulSoup
- import re
- import datetime
- from pytz import timezone
- import model
- import collections
- # import pytz
- """
- This module contains custom methods based on bs4.beautifulsoup to analyze data
- """
# Root of the free race-fields pages; relative hrefs scraped from the
# calendar are joined onto this to build absolute Scratchings URLs.
base_url = 'https://racingaustralia.horse/FreeFields/'

# A race venue: Australian state/territory code plus the venue's display name.
Venue = collections.namedtuple('Venue', 'state, name')

# Venue plus the raw date string from the calendar page and the derived
# Scratchings-page URL for that meeting.
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))

# RaceDayShort extended with the Scratchings-page metadata timestamps:
# "last published" and "close", each as both the raw AEST text and a
# timezone-independent unix time (see get_meta_data).
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
def get_today_row(this_text):
    """
    Traverse the main table on the front page of https://racingaustralia.horse
    and collect one RaceDayShort per venue link found in today's row.

    The first cell of the row holds the date; each subsequent cell is one
    state/territory column and may contain several venue links.

    :param this_text: HTML of the front page.
    :return: list of RaceDayShort tuples (state, venue name, date string,
             scratchings URL); venues whose link has no href are skipped.
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    # Row index 2 is assumed to be today's row — TODO confirm against live markup.
    cells = rows[2].select('td')

    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
    all_race_days = []

    # First cell carries the date; guard against missing <span> so a markup
    # change degrades to 'Unknown' instead of raising AttributeError.
    day = 'Unknown'
    if cells:
        date_span = cells[0].find('span')
        if date_span:
            day = date_span.getText()

    # Pair each remaining cell with its state column. zip() stops at the
    # shorter sequence, so extra columns can no longer raise IndexError
    # (the old states[i - 1] lookup could).
    for state, cell in zip(states, cells[1:]):
        venue_text = cell.find('p').getText().strip()
        if not venue_text:
            # Empty cell: no meeting in this state today.
            continue
        print(venue_text)  # NOTE(review): debug output left in; consider logging
        # A cell may contain several links, one per venue in that state.
        for a in cell.find_all('a'):
            venue_name = a.getText().strip()
            this_url = a.get('href')
            if this_url:
                # Derive the Scratchings URL by swapping the final
                # "/<page>.aspx" segment for "Scratchings.aspx".
                scratchings_url = base_url + re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                all_race_days.append(RaceDayShort(state, venue_name, day, scratchings_url))
    return all_race_days
def get_meta_data(this_data, this_venue):
    """
    Scrape the metadata box (top-right of the Scratchings page) for the
    "last published" and "close" timestamps, convert each to unix time
    (timezone independent), and return the venue extended into a RaceDay.

    Fields left as '' / 0 when the corresponding timestamp is absent.

    :param this_data: HTML of the Scratchings page.
    :param this_venue: a RaceDayShort to extend.
    :return: RaceDay with both timestamps as text and unix time.
    """
    soup = BeautifulSoup(this_data, 'html.parser')
    meta_block = soup.select('div.race-venue-bottom')[0].select('div.col2')[0]

    time_format = '%a %d-%b-%y %I:%M%p'
    aest = timezone('Australia/Brisbane')

    def parse_stamp(stamp):
        # e.g. 'Thu 20-Jun-19 7:42AM AEST' -> ('Thu 20-Jun-19 7:42AM', unixtime)
        trimmed = stamp[:-5]  # drop trailing ' AEST'
        naive = datetime.datetime.strptime(trimmed, time_format)
        return trimmed, model.convert_to_unixtime(aest.localize(naive))

    latest_text, latest_unix = '', 0
    close_text, close_unix = '', 0

    if meta_block:
        text = meta_block.getText()
        published = re.search('Scratchings Last Published: (.+? AEST)', text)
        if published:
            latest_text, latest_unix = parse_stamp(published[1])
        closing = re.search('Scratching close: (.+? AEST)', text)
        if closing:
            close_text, close_unix = parse_stamp(closing[1])

    return RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
                   this_venue.scratchings_url,
                   latest_text, latest_unix, close_text, close_unix)
|