# _bs.py — BeautifulSoup-based scraping helpers for racingaustralia.horse
from bs4 import BeautifulSoup
import re
import datetime
from pytz import timezone
import model
import collections
# import pytz
from pprint import pprint

"""
This module contains custom methods based on bs4.beautifulsoup to analyze data
"""

# Root of the free race-day pages scraped by this module.
base_url = 'https://racingaustralia.horse/FreeFields/'

# A race venue: Australian state abbreviation plus track name.
Venue = collections.namedtuple('Venue', 'state, name')

# Venue plus the raw date text from the page, the normalised date ('%Y-%m-%d')
# and the URL of the venue's Scratchings page.
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))

# noinspection PyProtectedMember,PyUnresolvedReferences
# RaceDayShort extended with the "last published" / "close" timestamps, each
# kept both as an AEST datetime string and as a unixtime (seconds since 1970).
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))

# One scratched (withdrawn) horse in a given race at a venue on a date.
Scratching = collections.namedtuple('Scratching', 'venue state date race horse')
  20. def get_today_row(this_text, this_row):
  21. """
  22. Traverses the main table on the front page of https://racingaustralia.horse.
  23. This function scrapes Venue information and race day information.
  24. Unfortunately there is no clever way to split this function into two parts.
  25. :param this_text:
  26. :param this_row:
  27. :return RaceDay this_race_day:
  28. """
  29. this_soup = BeautifulSoup(this_text, 'html.parser')
  30. rows = this_soup.select('tr.rows')
  31. # print('len(rows) {}'.format(len(rows)))
  32. all_race_days = []
  33. days_to_check = [this_row]
  34. if this_row == -1:
  35. days_to_check = range(len(rows))
  36. for day in days_to_check:
  37. my_row = rows[day]
  38. cells = my_row.select('td')
  39. i = 0
  40. states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
  41. day = 'Unknown'
  42. for cell in cells:
  43. if i == 0:
  44. # First cell contains date information
  45. day = cell.find('span').getText()
  46. # print("date: {}".format(day))
  47. i += 1
  48. continue
  49. venue_text = cell.find('p').getText().strip()
  50. if len(venue_text) > 0:
  51. # Cell is not empty
  52. # print(venue_text)
  53. this_a = cell.findAll('a') # .get('href')
  54. for a in this_a:
  55. # There may be several links in a cell (which represents a state)
  56. venue_name = a.getText().strip()
  57. this_venue = Venue(states[i - 1], venue_name)
  58. date_string = day
  59. this_url = a.get('href')
  60. if this_url:
  61. # Create the Scratchings URL by substitution
  62. scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
  63. scratchings_url = base_url + scratchings_url
  64. calculated_date = model.convert_to_date(date_string)
  65. this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
  66. calculated_date.strftime('%Y-%m-%d'), scratchings_url)
  67. all_race_days.append(this_race_day)
  68. i += 1
  69. return all_race_days
  70. def get_meta_data(this_data, this_venue):
  71. """
  72. Meta data is on the top-right of the Scratchings page. It contains a date and time for
  73. the latest update as well as the closing of reporting of Scratchings.
  74. This function scrapes both dateTimes and converts to unixtime (which is timezone unaware)
  75. The RaceDay namedTuple is accordingly extended.
  76. :param this_data:
  77. :param this_venue:
  78. :return:
  79. """
  80. global print_it
  81. this_soup = BeautifulSoup(this_data, 'html.parser')
  82. early = this_soup.select('div.large')
  83. # if early:
  84. # print(early.get_text())
  85. if early and 'not currently available' in early.get_text():
  86. # print(early.get_text())
  87. return
  88. try:
  89. this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
  90. except IndexError:
  91. return
  92. last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
  93. close_regex = re.compile('Scratching close: (.+? AEST)')
  94. # The times tuple is filled with a dateTime string then a unixtime (seconds since 1970)
  95. times = ['', 0, '', 0]
  96. time_format = '%a %d-%b-%y %I:%M%p'
  97. aest = timezone('Australia/Brisbane')
  98. if this_meta_data:
  99. this_meta_data = this_meta_data.getText()
  100. match = last_published_regex.search(this_meta_data)
  101. if match:
  102. # print(this_venue.name)
  103. # pprint(match)
  104. times[0] = match.group(1)[:-5]
  105. # times[0] = 'Thu 20-Jun-19 7:42AM'
  106. l_time = datetime.datetime.strptime(times[0], time_format)
  107. # print(aest.localize(l_time))
  108. times[1] = model.convert_to_unixtime(aest.localize(l_time))
  109. # print(times[1])
  110. match = close_regex.search(this_meta_data)
  111. utctime = datetime.datetime.utcnow()
  112. if (utctime.hour in (15, 16, 17, 18) and utctime.minute > 49) or \
  113. (utctime.hour in (16, 17, 18, 19) and utctime.minute < 11):
  114. print_it = True
  115. print('utctime: {}'.format(utctime))
  116. if match:
  117. print_it = False
  118. if '-19 ' not in match.group(1):
  119. print_it = True
  120. if print_it or not match.group(1):
  121. print_it = True
  122. print('match.group(1): {}'.format(match.group(1)))
  123. times[2] = match.group(1)[:-5]
  124. l_time = datetime.datetime.strptime(times[2], time_format)
  125. if print_it:
  126. print('match.group(1)[:5], {}'.format(match.group(1)[:5]))
  127. print('l_time: {}'.format(l_time))
  128. print('aest.localize(l_time); {}'.format(aest.localize(l_time)))
  129. times[3] = model.convert_to_unixtime(aest.localize(l_time))
  130. if print_it or times[3] < 1e6:
  131. print('times[3]: {}'.format(times[3]))
  132. print_it = True
  133. # print(times[3])
  134. if print_it or times[3] < 1e6:
  135. print_it = True
  136. print('datetime.date.fromtimestamp(times[3]+12*60*60): {}'.format(
  137. datetime.date.fromtimestamp(times[3] + 12 * 60 * 60)))
  138. elif print_it:
  139. print('this_meta_data: {}'.format(this_meta_data))
  140. print('No match for regex: {}'.format(close_regex))
  141. pprint(times)
  142. # The RaceDAy namedTuple is created and filled
  143. race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
  144. datetime.date.fromtimestamp(times[3] + 12 * 60 * 60),
  145. this_venue.scratchings_url, times[0], times[1], times[2], times[3])
  146. return race_day
  147. def scrape_scratchings(div, this_venue):
  148. old_race = 0
  149. race = 0
  150. scraped_scratchings = []
  151. for text in div.stripped_strings:
  152. if text[:5] == 'Race ':
  153. match = re.search('^Race ([0-9]+):$', text)
  154. if match:
  155. try:
  156. race = int(match.group(1))
  157. except ValueError:
  158. # This will happily fail in the next assert
  159. race = 0
  160. assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
  161. old_race = race
  162. continue
  163. if text[0] == '(':
  164. continue
  165. if len(text) > 0:
  166. if text[0:10] == 'There are ':
  167. continue
  168. try:
  169. int(text[0])
  170. except ValueError:
  171. print('First character in line: {}'.format(text[0]))
  172. print('The start of the offending line is: {}'.format(text[0:10]))
  173. continue
  174. text = re.sub(r'e\s+', ' ', text)
  175. temp_list = Scratching(this_venue.name, this_venue.state, this_venue.date, race, text)
  176. scraped_scratchings.append(temp_list)
  177. return scraped_scratchings
  178. def process_scratchings(this_data, this_venue):
  179. this_soup = BeautifulSoup(this_data, 'html.parser')
  180. try:
  181. this_scr = this_soup.select('div.scratchings')[0]
  182. except IndexError:
  183. return
  184. scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
  185. # print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))
  186. header = this_scr.findAll('h3', text=re.compile('Scratchings'))[0]
  187. div = header.findNext('table')
  188. scratchings = set()
  189. early_scratchings = scrape_scratchings(div, this_venue)
  190. scratchings.update(early_scratchings)
  191. header = this_scr.findAll('h3', text=re.compile('Late Scratchings'))[0]
  192. late_div = header.findNext('table')
  193. late_scratchings = scrape_scratchings(late_div, this_venue)
  194. # if this_venue.name == 'Corowa':
  195. # pprint(late_div)
  196. # pprint(late_scratchings)
  197. scratchings.update(late_scratchings)
  198. assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
  199. len(scratchings), scratchings_count)
  200. # if len(scratchings) != int(scratchings_count):
  201. # print('len(scratchings) {} == scratchings_count {}'.format(
  202. # len(scratchings), scratchings_count))
  203. # pprint(scratchings)
  204. return scratchings