# _bs.py
  1. from bs4 import BeautifulSoup
  2. import re
  3. import datetime
  4. from pytz import timezone
  5. import model
  6. import collections
  7. # import pytz
"""
Custom bs4 (BeautifulSoup) helpers for analyzing Racing Australia pages.
"""
# Root of the free race-fields pages; scraped relative hrefs are joined onto this.
base_url = 'https://racingaustralia.horse/FreeFields/'
# A venue is identified by its state abbreviation and its display name.
Venue = collections.namedtuple('Venue', 'state, name')
# A race day extends Venue with the row's date label and the scratchings-page URL.
RaceDay = collections.namedtuple('RaceDay', Venue._fields + ('date_string', 'scratchings_url'))
  14. def get_today_row(this_text):
  15. this_soup = BeautifulSoup(this_text, 'html.parser')
  16. rows = this_soup.select('tr.rows')
  17. my_row = rows[2]
  18. cells = my_row.select('td')
  19. i = 0
  20. states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
  21. all_race_days = []
  22. day = 'Unknown'
  23. for cell in cells:
  24. if i == 0:
  25. day = cell.find('span').getText()
  26. # print("date: {}".format(day))
  27. i += 1
  28. continue
  29. venue_text = cell.find('p').getText().strip()
  30. if len(venue_text) > 0:
  31. # print("{}: {}".format(states[i-1], venue_text))
  32. this_a = cell.findAll('a') # .get('href')
  33. for a in this_a:
  34. venue_name = a.getText().strip()
  35. this_venue = Venue(states[i - 1], venue_name)
  36. date_string = day
  37. this_url = a.get('href')
  38. scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
  39. scratchings_url = base_url + scratchings_url
  40. this_race_day = RaceDay(this_venue.state, this_venue.name, date_string, scratchings_url)
  41. all_race_days.append(this_race_day)
  42. i += 1
  43. return all_race_days
  44. def get_meta_data(this_data):
  45. this_soup = BeautifulSoup(this_data, 'html.parser')
  46. this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
  47. last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
  48. close_regex = re.compile('Scratching close: (.+? AEST)')
  49. times = ['', 0, '', 0]
  50. time_format = '%a %d-%b-%y %I:%M%p'
  51. aest = timezone('Australia/Brisbane')
  52. if this_meta_data:
  53. this_meta_data = this_meta_data.getText()
  54. match = last_published_regex.search(this_meta_data)
  55. if match:
  56. print(match[1])
  57. times[0] = match[1][:-5]
  58. # times[0] = 'Thu 20-Jun-19 7:42AM'
  59. l_time = datetime.datetime.strptime(times[0], time_format)
  60. # print(aest.localize(l_time))
  61. times[1] = model.convert_to_unixtime(aest.localize(l_time))
  62. print(times[1])
  63. match = close_regex.search(this_meta_data)
  64. if match:
  65. print(match[1])
  66. times[2] = match[1][:-5]
  67. l_time = datetime.datetime.strptime(times[2], time_format)
  68. # print(aest.localize(l_time))
  69. times[3] = model.convert_to_unixtime(aest.localize(l_time))
  70. print(times[3])
  71. return times
  72. # print(this_meta_data)