"""
_bs.py — custom helpers built on bs4.BeautifulSoup to scrape and analyse racing data.
"""
from bs4 import BeautifulSoup
import re
import datetime
from pytz import timezone
import model
import collections
from string import capwords
from pprint import pprint
import sys
import arrow

base_url = 'https://racingaustralia.horse/FreeFields/'

Venue = collections.namedtuple('Venue', 'state, name')
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'scratchings_url'))
# noinspection PyProtectedMember,PyUnresolvedExtensions
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
RawScratching = collections.namedtuple('RawScratching', 'venue state date race horse_no horse_display_name')
Scratching = collections.namedtuple('Scratching', 'venue state date race time utc horse_no horse_display_name torn')
RacenetRaces = collections.namedtuple('RacenetRaces', 'race_date venue state race start_time utctime')
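

# Illustration (hedged): RaceDayShort extends Venue, and RaceDay extends
# RaceDayShort, so each tuple can be built by unpacking the previous one.
# Every field value below is made up for the example, including the epoch numbers.
def _example_tuple_nesting():
    venue = Venue(state='NSW', name='Randwick')
    short = RaceDayShort(*venue, date_string='Sat 22-Jun-19',
                         date=datetime.date(2019, 6, 22),
                         scratchings_url=base_url + 'Scratchings.aspx')
    return RaceDay(*short, 'Thu 20-Jun-19 7:42AM', 1560980520,
                   'Sat 22-Jun-19 7:30AM', 1561152600)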


def get_today_row(this_text, this_row):
    """
    Traverses the main table on the front page of https://racingaustralia.horse,
    scraping venue information and race-day information in a single pass
    (the table layout leaves no clean way to split this into two functions).
    :param this_text: HTML of the front page
    :param this_row: index of the table row to scrape, or -1 for every row
    :return: list of RaceDayShort namedtuples
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    # print('len(rows) {}'.format(len(rows)))
    all_race_days = []
    days_to_check = [this_row]
    if this_row == -1:
        days_to_check = range(len(rows))
    for day in days_to_check:
        my_row = rows[day]
        cells = my_row.select('td')
        i = 0
        states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
        date_text = 'Unknown'
        for cell in cells:
            if i == 0:
                # The first cell contains the date information
                date_text = cell.find('span').getText()
                i += 1
                continue
            venue_text = cell.find('p').getText().strip()
            if len(venue_text) > 0:
                # The cell is not empty; each cell represents a state and
                # may contain several venue links
                for a in cell.findAll('a'):
                    venue_name = a.getText().strip()
                    this_venue = Venue(states[i - 1], venue_name)
                    this_url = a.get('href')
                    if this_url:
                        # Build the Scratchings URL by substitution
                        scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                        scratchings_url = base_url + scratchings_url
                        calculated_date = model.convert_to_date(date_text)
                        this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_text,
                                                     calculated_date, scratchings_url)
                        all_race_days.append(this_race_day)
            i += 1
    return all_race_days
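

# Usage sketch (hedged): fetching the front page is outside this module; the
# `requests` dependency here is an assumption made for illustration.
def _example_get_today_row():
    import requests  # assumed to be available in the project environment
    response = requests.get('https://racingaustralia.horse')  # front page with the main table
    # this_row == -1 walks every row of the table instead of a single day
    for race_day in get_today_row(response.text, -1):
        print(race_day.state, race_day.name, race_day.scratchings_url)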


def get_meta_data(this_data, this_venue):
    """
    The metadata sits at the top-right of the Scratchings page: a date and time
    for the latest update, and another for the close of scratchings reporting.
    This function scrapes both datetimes and converts them to unixtime (which is
    timezone-unaware), extending the RaceDayShort namedtuple into a RaceDay.
    :param this_data: HTML of the Scratchings page
    :param this_venue: a RaceDayShort namedtuple
    :return: a RaceDay namedtuple, or None if the page is not available
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    early = this_soup.select_one('div.large')
    if early and 'not currently available' in early.get_text():
        return
    try:
        this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
    except IndexError:
        return
    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
    close_regex = re.compile('Scratching close: (.+? AEST)')
    # The times list holds a datetime string followed by a unixtime
    # (seconds since 1970) for each of the two timestamps
    times = ['', 0, '', 0]
    time_format = '%a %d-%b-%y %I:%M%p'
    aest = timezone('Australia/Brisbane')
    if this_meta_data:
        this_meta_data = this_meta_data.getText()
        match = last_published_regex.search(this_meta_data)
        if match:
            # e.g. 'Thu 20-Jun-19 7:42AM'; the trailing ' AEST' is sliced off
            times[0] = match.group(1)[:-5]
            l_time = datetime.datetime.strptime(times[0], time_format)
            times[1] = model.convert_to_unixtime(aest.localize(l_time))
        match = close_regex.search(this_meta_data)
        if match:
            times[2] = match.group(1)[:-5]
            l_time = datetime.datetime.strptime(times[2], time_format)
            times[3] = model.convert_to_unixtime(aest.localize(l_time))
    # Create and fill the RaceDay namedtuple
    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string,
                       this_venue.date, this_venue.scratchings_url,
                       times[0], times[1], times[2], times[3])
    return race_day
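

# Sketch (hedged) of the timestamp handling above, assuming model.convert_to_unixtime()
# accepts a timezone-aware datetime and returns seconds since the epoch.
def _example_parse_meta_time():
    raw = 'Thu 20-Jun-19 7:42AM AEST'  # the shape scraped from the page
    trimmed = raw[:-5]                 # drop the trailing ' AEST', as above
    naive = datetime.datetime.strptime(trimmed, '%a %d-%b-%y %I:%M%p')
    aware = timezone('Australia/Brisbane').localize(naive)
    return model.convert_to_unixtime(aware)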


def scrape_scratchings(div, this_venue):
    """
    Walks the text of a scratchings table and collects one RawScratching
    namedtuple per scratched runner.
    :param div: the bs4 table element that follows a Scratchings header
    :param this_venue: a namedtuple carrying name, state and date fields
    :return: list of RawScratching namedtuples
    """
    old_race = 0
    race = 0
    scraped_scratchings = []
    for text in div.stripped_strings:
        if text[:5] == 'Race ':
            match = re.search('^Race ([0-9]+):$', text)
            if match:
                try:
                    race = int(match.group(1))
                except ValueError:
                    # This will deliberately fail in the next assert
                    race = 0
                assert race > old_race, 'race {} ! > old_race {}'.format(race, old_race)
                old_race = race
            continue
        if text[0] == '(':
            continue
        if text[0:10] == 'There are ':
            continue
        try:
            int(text[0])
        except ValueError:
            print('First character in line: {}'.format(text[0]))
            print('The start of the offending line is: {}'.format(text[0:10]))
            continue
        match = re.search(r'^(\d{1,2})e?\s+(.+)', text)
        no = 0
        name = ''
        if match:
            no = int(match.group(1))
            name = capwords(match.group(2))
            name = re.sub(r' Of ', ' of ', name)
            if name.endswith('(nz)'):
                name = name[:-len(' (nz)')]
        temp_list = RawScratching(this_venue.name, this_venue.state, this_venue.date, race, no, name)
        scraped_scratchings.append(temp_list)
    return scraped_scratchings
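

# Illustration (hedged): the runner regex above, applied to a made-up line.
# The optional 'e' after the number appears to mark an emergency runner.
def _example_runner_regex():
    text = '12e OCEAN OF STORMS (NZ)'
    match = re.search(r'^(\d{1,2})e?\s+(.+)', text)
    no, name = int(match.group(1)), capwords(match.group(2))
    name = re.sub(r' Of ', ' of ', name)
    if name.endswith('(nz)'):
        name = name[:-len(' (nz)')]
    return no, name  # (12, 'Ocean of Storms')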


def process_scratchings(this_data, this_venue):
    """
    Scrapes the scratchings and late scratchings tables of a Scratchings page
    and cross-checks the result against the page's own scratchings count.
    :param this_data: HTML of the Scratchings page
    :param this_venue: a RaceDayShort namedtuple
    :return: set of RawScratching namedtuples, or None if the page has no data
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    try:
        this_scr = this_soup.select('div.scratchings')[0]
    except IndexError:
        return
    scratchings_count = this_scr.select('table')[0].select('tr')[2].select('td')[3].getText()
    # print('{}: scratchings_count {}'.format(this_venue.name, scratchings_count))
    header = this_scr.findAll('h3', text=re.compile('Scratchings'))[0]
    div = header.findNext('table')
    scratchings = set()
    early_scratchings = scrape_scratchings(div, this_venue)
    scratchings.update(early_scratchings)
    header = this_scr.findAll('h3', text=re.compile('Late Scratchings'))[0]
    late_div = header.findNext('table')
    late_scratchings = scrape_scratchings(late_div, this_venue)
    # if this_venue.name == 'Corowa':
    #     pprint(late_div)
    #     pprint(late_scratchings)
    scratchings.update(late_scratchings)
    assert len(scratchings) == int(scratchings_count), 'len(scratchings) {} == scratchings_count {}'.format(
        len(scratchings), scratchings_count)
    return scratchings
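

# End-to-end sketch (hedged): tie get_today_row, get_meta_data and
# process_scratchings together for one race day. The `requests` dependency is
# an assumption, and all error handling is omitted.
def _example_scratchings_for_day(race_day_short):
    import requests  # assumed to be available in the project environment
    page = requests.get(race_day_short.scratchings_url).text
    race_day = get_meta_data(page, race_day_short)  # None if the page is not live yet
    scratchings = process_scratchings(page, race_day_short)
    return race_day, scratchings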


def get_racenet_json(html):
    """
    Extracts the inline `window.initialReduxState` JSON blob from a racenet page.
    :param html: HTML of the races page
    :return: the JSON string, or '{}' if it could not be found
    """
    this_soup = BeautifulSoup(html, 'html.parser')
    pattern = re.compile(r'window\.initialReduxState = (.*)')
    script = this_soup.find('script', text=pattern)
    json_string = '{}'
    if script:
        match = pattern.search(script.text)
        if match:
            json_string = match.group(1)
        else:
            print("Failing in 'match'")
    else:
        print("Failing in 'script'")
    return json_string
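

# Sketch (hedged): the extracted blob should be a JSON document, so it can be
# decoded with the standard library. The trailing-semicolon trim is an assumption
# about how the inline <script> statement is terminated.
def _example_decode_racenet_json(html):
    import json
    blob = get_racenet_json(html)
    if blob.endswith(';'):
        blob = blob[:-1]
    return json.loads(blob)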


def get_racenet_races(html):
    """
    Analyzes the HTML from the races page and scrapes venue and race information.
    :param html: HTML of the races page
    :return: list of RacenetRaces namedtuples
    """
    discard_non_tab = True
    discard_barrier_trials = True
    this_soup = BeautifulSoup(html, 'html.parser')
    tables = this_soup.find_all('table', class_='table-race-meetings')
    venues = []
    date_text = ''
    venue_text = ''
    venue_state = ''
    race_number = ''
    # Placeholder date; overwritten as soon as a section header is parsed
    date_parsed = arrow.get('Wednesday 14 August 2019', 'dddd DD MMMM YYYY')
    venue_name = ''
    print('{} tables found'.format(len(tables)))
    regex_time = re.compile(r'(\d{2}:\d{2})')
    regex_venue_state = re.compile(r'([ \w]+) \(([A-Z]{2,3})\)$')
    if tables:
        for table in tables:
            body = None
            if table:
                # if "table-race-meetings--trials" in table.attrs['class']:
                #     # print('This is a trial meeting')
                #     continue
                tab_panel = table.find_previous('div', id='meetinglist_tab_6')
                if tab_panel:
                    continue
                date_div = table.find_previous('div', class_='race-meetings-section-header')
                if date_div:
                    venue_h2 = date_div.find('h2', class_='race-meetings-section-title')
                    if venue_h2:
                        venue_text = venue_h2.getText()
                        if discard_barrier_trials and 'Barrier Trials' in venue_text:
                            print('Skipped `Barrier Trials` {}'.format(venue_text))
                            continue
                    date_span = date_div.find('span', class_='race-meetings-section-date')
                    if date_span:
                        date_text = date_span.getText()
                        date_parsed = arrow.get(date_text, 'dddd DD MMMM YYYY')
                        print('{} {} - {}'.format(date_parsed.date(), venue_text, date_text))
                body = table.find('tbody')
            else:
                print('No `table` found')
                # sys.exit(1)
            if body:
                all_rows = body.find_all('tr')
            else:
                print('No `body` found')
                continue
                # sys.exit(1)
            for row in all_rows:
                for td in row.find_all('td'):
                    venue_selector = td.find('h3')
                    if venue_selector:
                        venue_name = venue_selector.get_text().strip()
                        if discard_non_tab and 'Non-TAB' in venue_name:
                            print('Skipped Non-TAB {}'.format(venue_name))
                            continue
                        venue_name = re.sub('\nNon-TAB', '', venue_name).strip()
                        venue_match = regex_venue_state.search(venue_name)
                        if venue_match:
                            venue_name = venue_match.group(1)
                            venue_state = venue_match.group(2)
                            if venue_state == 'NZ':
                                venue_state = 'NZL'
                    elif td.get('class') and 'table-race-meeting-detail' in td.get('class'):
                        time_string = td.find('span', class_='table-race-meeting-detail-info').getText()
                        time_match = regex_time.search(time_string)
                        if time_match:
                            time_string = time_match.group(1)
                        if time_string == 'TBA':
                            continue
                        race_number_padded = td.get('data-race-number')
                        if race_number_padded:
                            # e.g. '01' -> '1'
                            race_number = race_number_padded[1:]
                        else:
                            continue
                        start_time = td.get('data-start-time')
                        time_string_with_date = date_parsed.format('YYYY-MM-DD') + ' ' + time_string
                        local_time = arrow.get(time_string_with_date, 'YYYY-MM-DD HH:mm').time()
                        # data-start-time is in milliseconds since the epoch
                        utc_time = arrow.get(int(start_time) / 1000).datetime
                        # All data is collected, so populate the namedtuple
                        racenet_race = RacenetRaces(date_parsed.date(), venue_name, venue_state,
                                                    race_number, local_time, utc_time)
                        venues.append(racenet_race)
    else:
        print('No `tables` found')
        sys.exit(1)
    pprint(venues)
    print('{} venues found'.format(len(venues)))
    return venues
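

# Smoke-test sketch (hedged): reading a saved copy of the page avoids hitting
# the live site. 'racenet.html' is a hypothetical local fixture, not a file
# this project is known to ship.
if __name__ == '__main__':
    with open('racenet.html', encoding='utf-8') as fh:
        get_racenet_races(fh.read())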