Bladeren bron

Working on racenet races

Foppe Hemminga 6 jaren geleden
bovenliggende
commit
89e241a8e5
5 gewijzigde bestanden met toevoegingen van 114 en 4 verwijderingen
  1. 99 1
      _bs.py
  2. 2 1
      main.py
  3. 9 1
      model.py
  4. 1 1
      racenet.py
  5. 3 0
      races.py

+ 99 - 1
_bs.py

@@ -6,7 +6,9 @@ import model
 import collections
 from string import capwords
 # import pytz
-# from pprint import pprint
+from pprint import pprint
+import sys
+import arrow
 
 """
 This module contains custom methods based on bs4.beautifulsoup to analyze data
@@ -21,6 +23,7 @@ RaceDay = collections.namedtuple('RaceDay', RaceDayShort._fields + (
     'scratchings_close_datetime', 'scratchings_close_unixtime'))
 RawScratching = collections.namedtuple('RawScratching', 'venue state date race horse_no horse_display_name')
 Scratching = collections.namedtuple('Scratching', 'venue state date race time utc horse_no horse_display_name torn')
+RacenetRaces = collections.namedtuple('RacenetRaces', 'date venue_name state race_no local_time unix_time')
 
 
 def get_today_row(this_text, this_row):
@@ -219,3 +222,98 @@ def get_racenet_json(html):
         print('Failing in {}'.format("'script'"))
     # pprint(json)
     return json
+
+
+def get_racenet_races(html):
+    """
+    Scrape venue and race information from the html of the racenet races page.
+
+    Every `table` with class `table-race-meetings` is visited. A table is skipped
+    when a `div` with id `meetinglist_tab_6` precedes it in the document, when its
+    section header is titled 'Barrier Trials', or when it has no `tbody`. Venues
+    whose heading contains 'Non-TAB' are left out together with their races, and
+    so are races whose time is still 'TBA'.
+
+    The collected races are pretty-printed and counted on stdout before they are
+    returned. When the page holds no meetings table at all the process exits
+    with status 1.
+
+    :param html: markup of the races page
+    :return: list with one `RacenetRaces` tuple per race found
+    """
+    discard_non_tab = True
+    discard_barrier_trials = True
+    # Raw strings keep the regex escapes away from Python's own string escapes
+    regex_time = re.compile(r'(\d{2}:\d{2})')
+    regex_venue_state = re.compile(r'([ \w]+) \(([A-Z]{2,3})\)$')
+    this_soup = BeautifulSoup(html, 'html.parser')
+    tables = this_soup.find_all('table', class_='table-race-meetings')
+    print('{} tables found'.format(len(tables)))
+    if not tables:
+        print('No `tables` found')
+        sys.exit(1)
+
+    venues = []
+    # Header and venue state live outside the loops: a table without a usable
+    # header keeps the values of the table before it, and a venue heading stays
+    # in force until the next heading is met.
+    date_text = ''
+    venue_text = ''
+    date_parsed = None
+    venue_name = None
+    venue_state = ''
+    skip_venue = False
+    for table in tables:
+        # NOTE(review): what tab 6 holds is not visible here; tables that come
+        # after this panel are ignored - confirm this is still intended.
+        if table.find_previous('div', id='meetinglist_tab_6'):
+            continue
+        date_div = table.find_previous('div', class_='race-meetings-section-header')
+        if date_div:
+            venue_h2 = date_div.find('h2', class_='race-meetings-section-title')
+            if venue_h2:
+                venue_text = venue_h2.getText()
+                if discard_barrier_trials and venue_text == 'Barrier Trials':
+                    continue
+            date_span = date_div.find('span', class_='race-meetings-section-date')
+            if date_span:
+                date_text = date_span.getText()
+                date_parsed = arrow.get(date_text, 'dddd DD MMMM YYYY')
+        if date_parsed is None:
+            # Without a date the races of this table cannot be described
+            print('No `date` found')
+            continue
+        race_date = date_parsed.date()
+        if date_div:
+            print('{} {} - {}'.format(race_date, venue_text, date_text))
+        body = table.find('tbody')
+        if not body:
+            print('No `body` found')
+            continue
+        for row in body.find_all('tr'):
+            for td in row.find_all('td'):
+                venue_selector = td.find('h3')
+                if venue_selector:
+                    # A new venue starts here, so nothing of the previous one may leak through
+                    venue_name = venue_selector.get_text().strip()
+                    venue_state = ''
+                    skip_venue = discard_non_tab and 'Non-TAB' in venue_name
+                    if skip_venue:
+                        continue
+                    venue_name = re.sub('\nNon-TAB', '', venue_name).strip()
+                    venue_match = regex_venue_state.search(venue_name)
+                    if venue_match:
+                        venue_name = venue_match.group(1)
+                        venue_state = venue_match.group(2)
+                        if venue_state == 'NZ':
+                            venue_state = 'NZL'
+                    continue
+                if 'table-race-meeting-detail' not in (td.get('class') or []):
+                    continue
+                if skip_venue or venue_name is None:
+                    # Race of a discarded venue, or a race cell before any venue heading
+                    continue
+                info_span = td.find('span', class_='table-race-meeting-detail-info')
+                race_attr = td.get('data-race-number')
+                if info_span is None or not race_attr:
+                    continue
+                time_string = info_span.getText()
+                time_match = regex_time.search(time_string)
+                if time_match:
+                    time_string = time_match.group(1)
+                elif time_string.strip() == 'TBA':
+                    continue
+                # The first character of the attribute is dropped
+                # (presumably a prefix such as 'R' - TODO confirm)
+                race_number = race_attr[1:]
+                start_time = td.get('data-start-time')
+                # All data is collected so we can populate the namedtuple
+                venues.append(RacenetRaces(race_date, venue_name, venue_state, race_number,
+                                           time_string, start_time))
+    pprint(venues)
+    print('{} venues found'.format(len(venues)))
+    return venues

+ 2 - 1
main.py

@@ -173,4 +173,5 @@ if __name__ == '__main__':
     db.close()
 
     interim = time.time()
-    # print('interim 3 {}'.format(interim - start))
+    if len(scratchings_to_be_broadcast) > 0:
+        print('interim 3 {}'.format(interim - start))

+ 9 - 1
model.py

@@ -39,7 +39,7 @@ def scrape_racingaustralia_main_page(row):
     return venues_all
 
 
-def scrape_racenet_main_page():
+def scrape_racenet_scratchings_page():
     this_url = """https://www.racenet.com.au/updates/scratchings"""
     this_data = _html.get_page(this_url)
     # print(this_data[:50])
@@ -47,6 +47,14 @@ def scrape_racenet_main_page():
     return json
 
 
+def scrape_racenet_races_page():
+    """
+    Download the racenet racing form guide page and run the races scraper on it.
+
+    :return: whatever `_bs.get_racenet_races` returns for the downloaded page
+    """
+    this_url = """https://www.racenet.com.au/racing-form-guide"""
+    this_data = _html.get_page(this_url)
+    # Hand the result back instead of binding it to an unused local
+    return _bs.get_racenet_races(this_data)
+
+
 def get_raw_scratchings(this_venue):
     this_raw_data = _html.get_page(this_venue.scratchings_url)
     return this_raw_data

+ 1 - 1
racenet.py

@@ -11,7 +11,7 @@ import database
 
 # Data = collections.namedtuple('Data', 'venue state race time horse_no horse flag')
 
-my_json = model.scrape_racenet_main_page()
+my_json = model.scrape_racenet_scratchings_page()
 my_json = textwrap.fill(my_json[:-1], 1e6)
 # print(my_json)
 # exit(0)

+ 3 - 0
races.py

@@ -0,0 +1,3 @@
+# Runs the racenet races scrape (model.scrape_racenet_races_page) as soon as this file is executed or imported.
+import model
+
+model.scrape_racenet_races_page()