Browse Source

Initial commit

Foppe Hemminga 6 năm trước cách đây
commit
e84d7881ee
11 tập tin đã thay đổi với 327 bổ sung và 0 xóa
  1. 10 0
      .idea/dictionaries/foppe.xml
  2. 10 0
      .idea/misc.xml
  3. 8 0
      .idea/modules.xml
  4. 13 0
      .idea/scratchings.iml
  5. 6 0
      .idea/vcs.xml
  6. 79 0
      _bs.py
  7. 12 0
      _html.py
  8. 14 0
      main.py
  9. 41 0
      model.py
  10. 8 0
      requirements.txt
  11. 126 0
      venv/.gitignore

+ 10 - 0
.idea/dictionaries/foppe.xml

@@ -0,0 +1,10 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="foppe">
+    <words>
+      <w>aest</w>
+      <w>beautifulsoup</w>
+      <w>pytz</w>
+      <w>unixtime</w>
+    </words>
+  </dictionary>
+</component>

+ 10 - 0
.idea/misc.xml

@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="NodePackageJsonFileManager">
+    <packageJsonPaths />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (scratchings)" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/scratchings.iml" filepath="$PROJECT_DIR$/.idea/scratchings.iml" />
+    </modules>
+  </component>
+</project>

+ 13 - 0
.idea/scratchings.iml

@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>

+ 6 - 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 79 - 0
_bs.py

@@ -0,0 +1,79 @@
+from bs4 import BeautifulSoup
+import re
+import datetime
+from pytz import timezone
+import model
+import collections
+# import pytz
+
+
+"""
+This module contains custom functions based on bs4.BeautifulSoup to analyze data
+"""
+
+base_url = 'https://racingaustralia.horse/FreeFields/'
+Venue = collections.namedtuple('Venue', 'state, name')
+RaceDay = collections.namedtuple('RaceDay', Venue._fields + ('date_string', 'scratchings_url'))
+
+
+def get_today_row(this_text):
+    this_soup = BeautifulSoup(this_text, 'html.parser')
+    rows = this_soup.select('tr.rows')
+    my_row = rows[2]
+    cells = my_row.select('td')
+    i = 0
+    states = ('NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'ACT', 'NT')
+    all_race_days = []
+    day = 'Unknown'
+    for cell in cells:
+        if i == 0:
+            day = cell.find('span').getText()
+            # print("date: {}".format(day))
+            i += 1
+            continue
+        venue_text = cell.find('p').getText().strip()
+        if len(venue_text) > 0:
+            # print("{}: {}".format(states[i-1], venue_text))
+            this_a = cell.findAll('a')  # .get('href')
+            for a in this_a:
+                venue_name = a.getText().strip()
+                this_venue = Venue(states[i - 1], venue_name)
+                date_string = day
+                this_url = a.get('href')
+                scratchings_url = re.sub(r"/(.*)\.aspx", 'Scratchings.aspx', this_url)
+                scratchings_url = base_url + scratchings_url
+                this_race_day = RaceDay(this_venue.state, this_venue.name, date_string, scratchings_url)
+                all_race_days.append(this_race_day)
+        i += 1
+    return all_race_days
+
+
+def get_meta_data(this_data):
+    this_soup = BeautifulSoup(this_data, 'html.parser')
+    this_meta_data = this_soup.select('div.race-venue-bottom')[0].select('div.col2')[0]
+    last_published_regex = re.compile('Scratchings Last Published: (.+? AEST)')
+    close_regex = re.compile('Scratching close: (.+? AEST)')
+    times = ['', 0, '', 0]
+    time_format = '%a %d-%b-%y %I:%M%p'
+    aest = timezone('Australia/Brisbane')
+    if this_meta_data:
+        this_meta_data = this_meta_data.getText()
+        match = last_published_regex.search(this_meta_data)
+        if match:
+            print(match[1])
+            times[0] = match[1][:-5]
+            # times[0] = 'Thu 20-Jun-19 7:42AM'
+            l_time = datetime.datetime.strptime(times[0], time_format)
+            # print(aest.localize(l_time))
+            times[1] = model.convert_to_unixtime(aest.localize(l_time))
+            print(times[1])
+        match = close_regex.search(this_meta_data)
+        if match:
+            print(match[1])
+            times[2] = match[1][:-5]
+            l_time = datetime.datetime.strptime(times[2], time_format)
+            # print(aest.localize(l_time))
+            times[3] = model.convert_to_unixtime(aest.localize(l_time))
+            print(times[3])
+    return times
+    # print(this_meta_data)

+ 12 - 0
_html.py

@@ -0,0 +1,12 @@
+import requests
+
+
+"""
+This module contains methods to retrieve pages
+"""
+
+
+def get_page(this_url):
+    # url = 'https://twitter.com/TheOnion'
+    data = requests.get(this_url)
+    return data.text

+ 14 - 0
main.py

@@ -0,0 +1,14 @@
+import model
+# import os
+from pprint import pprint
+
+
+if __name__ == '__main__':
+    # current_timezone = os.environ['TZ']
+
+    venues = model.scrape_main_page()
+    pprint(venues)
+    for venue in venues:
+        model.get_scratchings(venue)
+    # Reset timezone to its original
+    # os.environ['TZ'] = current_timezone

+ 41 - 0
model.py

@@ -0,0 +1,41 @@
+import _html
+import _bs
+import pytz
+import datetime
+
+
+"""
+Modules _html and _bs contain specialized functions.
+"""
+
+local_timezones = {
+    "NSW": "Australia/Sydney",
+    "VIC": "Australia/Melbourne",
+    "QLD": "Australia/Brisbane",
+    "WA": "Australia/Perth",
+    "SA": "Australia/Adelaide",
+    "TAS": "Australia/Hobart",
+    "ACT": "Australia/Sydney",
+    "NT": "Australia/Darwin"}
+
+
+def scrape_main_page():
+    this_url = """https://racingaustralia.horse/Home.aspx"""
+    this_data = _html.get_page(this_url)
+    venues_all = _bs.get_today_row(this_data)
+    return venues_all
+
+
+def get_scratchings(this_venue):
+    this_data = _html.get_page(this_venue[3])
+    # print(this_data)
+    meta_data = _bs.get_meta_data(this_data)
+
+
+def convert_to_unixtime(dt_object):
+    utc = pytz.UTC
+    d = dt_object.astimezone(utc)
+
+    epoch = datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.UTC)
+    ts = int((d - epoch).total_seconds())
+    return ts

+ 8 - 0
requirements.txt

@@ -0,0 +1,8 @@
+beautifulsoup4==4.7.1
+certifi==2019.6.16
+chardet==3.0.4
+idna==2.8
+pytz==2019.1
+requests==2.22.0
+soupsieve==1.9.1
+urllib3==1.25.3

+ 126 - 0
venv/.gitignore

@@ -0,0 +1,126 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+