# _bs.py

from bs4 import BeautifulSoup
import re
import datetime
from pytz import timezone
import model
import collections
# import pytz

"""
This module contains custom methods based on bs4.BeautifulSoup to analyze
scraped racingaustralia.horse pages.
"""

base_url = 'https://racingaustralia.horse/FreeFields/'

Venue = collections.namedtuple('Venue', 'state, name')
RaceDayShort = collections.namedtuple('RaceDayShort', Venue._fields + ('date_string', 'scratchings_url'))
# noinspection PyProtectedMember,PyUnresolvedReferences
RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
    'scratchings_latest_datetime', 'scratchings_latest_unixtime',
    'scratchings_close_datetime', 'scratchings_close_unixtime'))
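
# For illustration only (hypothetical values, not scraped from a live page):
# get_today_row() yields RaceDayShort records shaped like
#     RaceDayShort(state='NSW', name='Randwick',
#                  date_string='Saturday 22 June 2019',
#                  scratchings_url='https://racingaustralia.horse/FreeFields/Scratchings.aspx?...')
# and get_meta_data() widens such a record into a RaceDay carrying the four
# scratchings timestamp fields defined above.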


def get_today_row(this_text):
    """
    Traverses the main table on the front page of https://racingaustralia.horse.
    This function scrapes venue information and race day information.
    Unfortunately there is no clever way to split this function into two parts.
    :param this_text:
    :return list of RaceDayShort all_race_days:
    """
    this_soup = BeautifulSoup(this_text, 'html.parser')
    rows = this_soup.select('tr.rows')
    my_row = rows[2]
    cells = my_row.select('td')
    i = 0
    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
    all_race_days = []
    day = 'Unknown'
    for cell in cells:
        if i == 0:
            # First cell contains date information
            day = cell.find('span').getText()
            # print("date: {}".format(day))
            i += 1
            continue
        venue_text = cell.find('p').getText().strip()
        if len(venue_text) > 0:
            # Cell is not empty
            print(venue_text)
            this_a = cell.findAll('a')  # .get('href')
            for a in this_a:
                # There may be several links in a cell (which represents a state)
                venue_name = a.getText().strip()
                this_venue = Venue(states[i - 1], venue_name)
                date_string = day
                this_url = a.get('href')
                if this_url:
                    # Create the Scratchings URL by substitution
                    scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
                    scratchings_url = base_url + scratchings_url
                    this_race_day = RaceDayShort(this_venue.state, this_venue.name, date_string, scratchings_url)
                    all_race_days.append(this_race_day)
        i += 1
    return all_race_days
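
# Usage sketch for get_today_row (comment only; `requests` is an assumed
# dependency here and is not imported by this module):
#
#     import requests
#     front_page = requests.get('https://racingaustralia.horse/')
#     for short_day in get_today_row(front_page.text):
#         print(short_day.state, short_day.name, short_day.scratchings_url)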


def get_meta_data(this_data, this_venue):
    """
    Meta data is on the top-right of the Scratchings page. It contains a date and time for
    the latest update as well as for the close of scratchings reporting.
    This function scrapes both datetimes and converts them to unixtime (which is timezone unaware).
    The RaceDay namedtuple is extended accordingly.
    :param this_data:
    :param this_venue:
    :return RaceDay race_day:
    """
    this_soup = BeautifulSoup(this_data, 'html.parser')
    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
    close_regex = re.compile('Scratching close: (.+? AEST)')
    times = ['', 0, '', 0]
    time_format = '%a %d-%b-%y %I:%M%p'
    # Brisbane is AEST all year round (Queensland has no daylight saving)
    aest = timezone('Australia/Brisbane')
    if this_meta_data:
        this_meta_data = this_meta_data.getText()
        match = last_published_regex.search(this_meta_data)
        if match:
            # print(match[1])
            # Drop the trailing ' AEST' suffix so strptime can parse the timestamp
            times[0] = match[1][:-5]
            # times[0] = 'Thu 20-Jun-19 7:42AM'
            l_time = datetime.datetime.strptime(times[0], time_format)
            # print(aest.localize(l_time))
            times[1] = model.convert_to_unixtime(aest.localize(l_time))
            # print(times[1])
        match = close_regex.search(this_meta_data)
        if match:
            # print(match[1])
            times[2] = match[1][:-5]
            l_time = datetime.datetime.strptime(times[2], time_format)
            # print(aest.localize(l_time))
            times[3] = model.convert_to_unixtime(aest.localize(l_time))
            # print(times[3])
    race_day = RaceDay(this_venue.state, this_venue.name, this_venue.date_string, this_venue.scratchings_url,
                       times[0], times[1], times[2], times[3])
    return race_day
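

# End-to-end sketch tying both scrapers together. This is illustrative only:
# it assumes the `requests` package is installed and racingaustralia.horse is
# reachable, and it is not part of the original scraping logic above.
if __name__ == '__main__':
    import requests

    front_page = requests.get('https://racingaustralia.horse/')
    for short_day in get_today_row(front_page.text):
        scratchings_page = requests.get(short_day.scratchings_url)
        full_day = get_meta_data(scratchings_page.text, short_day)
        print(full_day.name, full_day.scratchings_latest_datetime,
              full_day.scratchings_close_datetime)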