@@ -0,0 +1,79 @@
+from bs4 import BeautifulSoup
+import re
+import datetime
+from pytz import timezone
+import model
+import collections
+# import pytz
+
+
+"""
+This module contains custom methods, based on bs4.BeautifulSoup, to analyze race-day and scratchings data scraped from racingaustralia.horse.
+"""
+
+base_url = 'https://racingaustralia.horse/FreeFields/'
+Venue = collections.namedtuple('Venue', 'state, name')
+RaceDay = collections.namedtuple('RaceDay', Venue._fields + ('date_string', 'scratchings_url'))
+
+
+def get_today_row(this_text):
+    this_soup = BeautifulSoup(this_text, 'html.parser')
+    rows = this_soup.select('tr.rows')
+    my_row = rows[2]  # row index 2 is assumed to hold today's meetings on the calendar page
+    cells = my_row.select('td')
+    i = 0  # cell counter: cell 0 is the date, cells 1-8 map onto states
+    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')  # column order of the calendar table
+    all_race_days = []
+    day = 'Unknown'
+    for cell in cells:
+        if i == 0:
+            day = cell.find('span').getText()  # first cell carries the date label
+            # print("date: {}".format(day))
+            i += 1
+            continue
+        venue_text = cell.find('p').getText().strip()
+        if len(venue_text) > 0:
+            # print("{}: {}".format(states[i-1], venue_text))
+            this_a = cell.findAll('a')  # one link per meeting in this state
+            for a in this_a:
+                venue_name = a.getText().strip()
+                this_venue = Venue(states[i - 1], venue_name)
+                date_string = day
+                this_url = a.get('href')
+                scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)  # swap the linked page for Scratchings.aspx, keeping the query string
+                scratchings_url = base_url + scratchings_url
+                this_race_day = RaceDay(this_venue.state, this_venue.name, date_string, scratchings_url)
+                all_race_days.append(this_race_day)
+        i += 1
+    return all_race_days
+
+
+def get_meta_data(this_data):
+    this_soup = BeautifulSoup(this_data, 'html.parser')
+    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]  # block holding the scratchings timestamps
+    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
+    close_regex = re.compile('Scratching close: (.+? AEST)')
+    times = ['', 0, '', 0]  # [published string, published unixtime, close string, close unixtime]
+    time_format = '%a %d-%b-%y %I:%M%p'  # e.g. 'Thu 20-Jun-19 7:42AM'
+    aest = timezone('Australia/Brisbane')  # Brisbane observes AEST year-round (no daylight saving)
+    if this_meta_data:
+        this_meta_data = this_meta_data.getText()
+        match = last_published_regex.search(this_meta_data)
+        if match:
+            print(match[1])
+            times[0] = match[1][:-5]  # strip the trailing ' AEST'
+            # times[0] = 'Thu 20-Jun-19 7:42AM'
+            l_time = datetime.datetime.strptime(times[0], time_format)
+            # print(aest.localize(l_time))
+            times[1] = model.convert_to_unixtime(aest.localize(l_time))
+            print(times[1])
+        match = close_regex.search(this_meta_data)
+        if match:
+            print(match[1])
+            times[2] = match[1][:-5]  # strip the trailing ' AEST'
+            l_time = datetime.datetime.strptime(times[2], time_format)
+            # print(aest.localize(l_time))
+            times[3] = model.convert_to_unixtime(aest.localize(l_time))
+            print(times[3])
+    return times
+    # print(this_meta_data)
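For review, a minimal sketch of how get_today_row might be driven end to end. It assumes the requests package, a module name of scrape, and a calendar page called Calendar.aspx; none of these appear in the diff, so they are placeholders only.

import requests  # assumed to be available; not a dependency shown in this diff

import scrape  # hypothetical name for the module added in this diff

# Hypothetical calendar page; the page actually fed to get_today_row is not named in the diff.
calendar_url = scrape.base_url + 'Calendar.aspx'

response = requests.get(calendar_url)
response.raise_for_status()

# get_today_row returns a list of RaceDay tuples: (state, name, date_string, scratchings_url)
for race_day in scrape.get_today_row(response.text):
    print(race_day.state, race_day.name, race_day.date_string)
    print('  scratchings page:', race_day.scratchings_url)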
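A similar sketch for get_meta_data, chaining from the first RaceDay returned above. The same placeholders apply, and it assumes model.convert_to_unixtime returns an epoch timestamp, which the code implies but this diff does not show.

import requests  # assumed, as in the previous sketch

import scrape  # hypothetical module name, as above

calendar_url = scrape.base_url + 'Calendar.aspx'  # placeholder page name, as above

race_days = scrape.get_today_row(requests.get(calendar_url).text)
if race_days:
    scratchings_html = requests.get(race_days[0].scratchings_url).text
    # get_meta_data returns [published string, published unixtime, close string, close unixtime]
    published_str, published_ts, close_str, close_ts = scrape.get_meta_data(scratchings_html)
    print('Scratchings last published:', published_str, published_ts)
    print('Scratchings close:', close_str, close_ts)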