"""Helpers built on bs4.BeautifulSoup for parsing Racing Australia pages.

``get_today_row`` extracts the race meetings listed in one row of the
calendar table; ``get_meta_data`` extracts the scratchings timestamps from a
meeting's scratchings page.
"""
import collections
import datetime
import re

from bs4 import BeautifulSoup
from pytz import timezone

import model

base_url = 'https://racingaustralia.horse/FreeFields/'

Venue = collections.namedtuple('Venue', 'state, name')
RaceDay = collections.namedtuple(
    'RaceDay', Venue._fields + ('date_string', 'scratchings_url'))

# State label applied to each cell that follows the leading date cell.
_STATES = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')

_LAST_PUBLISHED_RE = re.compile('Scratchings Last Published: (.+? AEST)')
_CLOSE_RE = re.compile('Scratching close: (.+? AEST)')
_TIME_FORMAT = '%a %d-%b-%y %I:%M%p'


def get_today_row(this_text):
    """Return the race meetings listed in the third ``tr.rows`` table row.

    The first ``td`` of that row supplies the date text (taken from its
    ``span``); each following ``td`` is paired, in order, with an entry of
    ``_STATES``. Every ``<a>`` inside a cell whose ``<p>`` has non-blank text
    becomes one ``RaceDay``.

    Args:
        this_text: HTML markup of the calendar page.

    Returns:
        A list of ``RaceDay`` tuples (possibly empty). ``date_string`` is
        ``'Unknown'`` when the date cell has no ``span``.

    Raises:
        IndexError: if the markup has fewer than three ``tr.rows`` rows.
    """
    soup = BeautifulSoup(this_text, 'html.parser')
    cells = soup.select('tr.rows')[2].select('td')
    if not cells:
        return []

    day_span = cells[0].find('span')
    day = day_span.get_text() if day_span is not None else 'Unknown'

    race_days = []
    # zip() stops at the shorter sequence, so surplus cells are ignored
    # rather than indexing past the end of the state list.
    for state, cell in zip(_STATES, cells[1:]):
        paragraph = cell.find('p')
        if paragraph is None or not paragraph.get_text().strip():
            continue  # no meetings listed for this state
        for anchor in cell.find_all('a'):
            href = anchor.get('href')
            if href is None:
                continue  # nothing to derive a scratchings URL from
            # Replace everything from the first '/' up to the last '.aspx'
            # with the scratchings page name; any text after it is kept.
            scratchings_page = re.sub(
                r"/(.*)\.aspx", 'Scratchings.aspx', href)
            race_days.append(RaceDay(
                state,
                anchor.get_text().strip(),
                day,
                base_url + scratchings_page,
            ))
    return race_days


def get_meta_data(this_data):
    """Extract the scratchings timestamps from a scratchings page.

    Looks at the text of the first ``div.col2`` inside the first
    ``div.race-venue-bottom`` for the "Scratchings Last Published" and
    "Scratching close" entries.

    Args:
        this_data: HTML markup of the scratchings page.

    Returns:
        A four-item list ``[published_text, published_converted, close_text,
        close_converted]``. The text items are the matched timestamps without
        the trailing ``' AEST'``; the converted items are whatever
        ``model.convert_to_unixtime`` returns for the localized datetime.
        Entries that are not found keep their defaults (``''`` and ``0``),
        including when the metadata block itself is missing.

    Raises:
        ValueError: if a matched timestamp does not fit ``_TIME_FORMAT``.
    """
    times = ['', 0, '', 0]
    soup = BeautifulSoup(this_data, 'html.parser')

    # Check each selection before indexing so a page without the metadata
    # block yields the defaults instead of an IndexError.
    containers = soup.select('div.race-venue-bottom')
    columns = containers[0].select('div.col2') if containers else []
    if not columns:
        return times

    meta_text = columns[0].get_text()
    # NOTE(review): presumably Brisbane is used because it stays on AEST
    # all year (no daylight saving) - confirm.
    aest = timezone('Australia/Brisbane')

    for slot, pattern in ((0, _LAST_PUBLISHED_RE), (2, _CLOSE_RE)):
        match = pattern.search(meta_text)
        if not match:
            continue
        print(match[1])
        times[slot] = match[1][:-5]  # drop the trailing ' AEST'
        naive_time = datetime.datetime.strptime(times[slot], _TIME_FORMAT)
        times[slot + 1] = model.convert_to_unixtime(aest.localize(naive_time))
        print(times[slot + 1])
    return times