# _bs.py
  1. from bs4 import BeautifulSoup
  2. import re
  3. # import datetime
  4. # from pytz import timezone
  5. import model
  6. import collections
  7. # # import pytz
  8. # from pprint import pprint
  9. """
  10. This module contains custom methods based on bs4.beautifulsoup to analyze data
  11. """
  12. base_url = 'https://racingaustralia.horse/FreeFields/'
  13. Venue = collections.namedtuple('Venue', 'state, name')
  14. RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'date', 'program_url'))
  15. # noinspection PyProtectedMember,PyUnresolvedReferences
  16. RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
  17. 'scratchings_latest_datetime', 'scratchings_latest_unixtime',
  18. 'scratchings_close_datetime', 'scratchings_close_unixtime'))
  19. Scratching = collections.namedtuple('Scratching', 'venue state date race horse')
  20. def get_today_row(this_text, this_row):
  21. """
  22. Traverses the main table on the front page of https://racingaustralia.horse.
  23. This function scrapes Venue information and race day information.
  24. Unfortunately there is no clever way to split this function into two parts.
  25. :param this_text:
  26. :param this_row:
  27. :return RaceDay this_race_day:
  28. """
  29. this_soup = BeautifulSoup(this_text, 'html.parser')
  30. rows = this_soup.select('tr.rows')
  31. # print('len(rows) {}'.format(len(rows)))
  32. all_race_days = []
  33. days_to_check = [this_row]
  34. if this_row == -1:
  35. days_to_check = range(len(rows))
  36. for day in days_to_check:
  37. my_row = rows[day]
  38. cells = my_row.select('td')
  39. i = 0
  40. states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
  41. day = 'Unknown'
  42. for cell in cells:
  43. if i == 0:
  44. # First cell contains date information
  45. day = cell.find('span').getText()
  46. # print("date: {}".format(day))
  47. i += 1
  48. continue
  49. venue_text = cell.find('p').getText().strip()
  50. if len(venue_text) > 0:
  51. # Cell is not empty
  52. # print(venue_text)
  53. this_a = cell.findAll('a') # .get('href')
  54. for a in this_a:
  55. # There may be several links in a cell (which represents a state)
  56. venue_name = a.getText().strip()
  57. this_venue = Venue(states[i - 1], venue_name)
  58. date_string = day
  59. this_url = a.get('href')
  60. if this_url:
  61. # Create the Scratchings URL by substitution
  62. program_url = re.sub(r"/(.*)\.aspx", 'RaceProgram.aspx', this_url)
  63. program_url = base_url + program_url
  64. calculated_date = model.convert_to_date(date_string)
  65. this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string,
  66. calculated_date, program_url)
  67. all_race_days.append(this_race_day)
  68. i += 1
  69. return all_race_days
  70. def separate_races(program_html):
  71. """
  72. Get the description line for each of the races from the html
  73. :rtype: object
  74. :param program_html:
  75. :return:
  76. """
  77. this_soup = BeautifulSoup(program_html, 'html.parser')
  78. table_blocks = this_soup.select('table.race-title')
  79. # print(len(table_blocks))
  80. races = []
  81. for table in table_blocks:
  82. titles = table.select('a.race-title-anchor-3')
  83. for title in titles:
  84. this_line = title.getText()
  85. race_match = re.search(r'^Race (\d+) - (\d{1,2}:\d{2}[AP]M) ', this_line)
  86. if race_match:
  87. races.append((race_match.group(1), race_match.group(2)))
  88. return races