Skip to content

Commit

Permalink
Added capacity scraper (#130)
Browse files Browse the repository at this point in the history
  • Loading branch information
vinnie4k authored Jan 8, 2024
1 parent fd95f2d commit 7684c3e
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 26 deletions.
19 changes: 12 additions & 7 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,24 +38,29 @@ def shutdown_session(exception=None):
db_session.remove()


# Scrape hours every 15 minutes
@scheduler.task("interval", id="scrape_hours", seconds=900)
def scrape_hours():
    """
    Scheduled job that scrapes hours from the sheets.

    Registered with the scheduler as the interval task ``scrape_hours`` with
    a period of 900 seconds. Calls ``fetch_reg_facility``,
    ``fetch_reg_building`` and ``fetch_sp_facility`` in that order; an
    exception raised by one of them prevents the following ones from running.
    """
    logging.info("Scraping hours from sheets...")

    # Fetch Hours
    fetch_reg_facility()
    fetch_reg_building()
    fetch_sp_facility()

# Scrape capacities every 10 minutes
@scheduler.task("interval", id="scrape_capacities", seconds=600)
def scrape_capacities():
    """
    Scheduled job that scrapes facility capacities from C2C.

    Registered with the scheduler as the interval task ``scrape_capacities``
    with a period of 600 seconds. All of the work is delegated to
    ``fetch_capacities``.
    """
    logging.info("Scraping capacities from C2C...")

    fetch_capacities()


# Create database and fill it with data.
# Each scrape job is also called once directly here, so it runs immediately
# at import time in addition to its scheduled interval.
init_db()
create_gym_table()
scrape_hours()
scrape_capacities()


# Create schema.graphql
with open("schema.graphql", "w+") as schema_file:
Expand Down
60 changes: 41 additions & 19 deletions src/scrapers/capacities_scraper.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,53 @@
import gspread, pytz
import pytz, requests
from bs4 import BeautifulSoup
from collections import namedtuple
from datetime import datetime, timezone
from pandas import DataFrame
from src.database import db_session
from src.models.capacity import Capacity
from src.utils.constants import EASTERN_TIMEZONE, SERVICE_ACCOUNT_PATH, SHEET_CAPACITIES, SHEET_KEY
from src.utils.constants import (
C2C_URL,
CAPACITY_MARKER_COUNTS,
CAPACITY_MARKER_NAMES,
CAPACITY_MARKER_UPDATED,
CAPACITY_MARKER_PERCENT,
CAPACITY_MARKER_PERCENT_NA,
EASTERN_TIMEZONE,
)
from src.utils.utils import get_facility_id, unix_time

# Configure client and sheet
# NOTE(review): these two statements look like residue of the pre-commit,
# sheet-based version -- `gspread`, SERVICE_ACCOUNT_PATH and SHEET_KEY occur
# only in the superseded import lines of this diff. Confirm they were removed
# together with those imports.
gc = gspread.service_account(filename=SERVICE_ACCOUNT_PATH)
sh = gc.open_by_key(SHEET_KEY)


def fetch_capacities():
    """
    Fetch capacities for all facilities from Connect2Concepts.

    Downloads the page at ``C2C_URL``, reads every ``<div class="barChart">``
    element and hands the parsed count, facility id, percent and update time
    of each one to ``add_single_capacity``.

    Every bar-chart div is expected to hold exactly four non-empty text
    fragments, in this order: facility name, last count, last-updated text
    and percent.

    Raises:
        requests.exceptions.RequestException: the page could not be fetched,
            or no response arrived within the timeout.
        TypeError: a div did not contain exactly four text fragments.
        KeyError: a facility name is missing from ``CAPACITY_MARKER_NAMES``.
        ValueError: the count or percent text is not numeric.
    """
    # Build the record type once instead of on every loop iteration.
    CapacityData = namedtuple("CapacityData", ["name", "count", "updated", "percent"])

    # NOTE(review): a browser user agent is sent, presumably because the site
    # does not serve the default client user agent -- confirm.
    headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0"}

    # Bound the request: without a timeout a stalled connection would block
    # the caller (app start-up or the scheduled job) indefinitely.
    response = requests.get(C2C_URL, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # For each div element
    for facility_data in soup.find_all("div", attrs={"class": "barChart"}):
        # Strip first, then drop empties, so whitespace-only fragments do not
        # end up as empty fields.
        lines = facility_data.get_text("\n").split("\n")
        capacities = [text for text in map(str.strip, lines) if text]
        capacity_data = CapacityData(*capacities)

        # Parse data
        facility_id = get_facility_id(CAPACITY_MARKER_NAMES[capacity_data.name])
        count = int(capacity_data.count.replace(CAPACITY_MARKER_COUNTS, ""))
        updated = get_capacity_datetime(capacity_data.updated.replace(CAPACITY_MARKER_UPDATED, ""))
        if capacity_data.percent == CAPACITY_MARKER_PERCENT_NA:
            # No percent available for this facility
            percent = 0.0
        else:
            # The page shows a whole-number percent; convert it to a fraction
            percent = float(capacity_data.percent.replace(CAPACITY_MARKER_PERCENT, "")) / 100

        # Record the reading (add_single_capacity is not shown in this view)
        add_single_capacity(count, facility_id, percent, updated)


Expand Down
27 changes: 27 additions & 0 deletions src/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,33 @@
# URL for Uplift image assets
ASSET_BASE_URL = "https://raw.githubusercontent.com/cuappdev/assets/master/uplift/"

# URL of the Connect2Concepts (C2C) page that the capacities scraper
# downloads and parses for facility capacities
C2C_URL = "https://connect2concepts.com/connect2/?type=bar&key=355de24d-d0e4-4262-ae97-bc0c78b92839&loc_status=false"

# Label text that accompanies the count in the HTML; the capacities scraper
# removes it before converting the remainder to an int
CAPACITY_MARKER_COUNTS = "Last Count: "

# Maps each facility name as it appears in the HTML (key) to the name that
# the capacities scraper passes to get_facility_id (value)
CAPACITY_MARKER_NAMES = {
"Helen Newman Fitness Center": "HNH Fitness Center",
"Noyes Fitness Center": "Noyes Fitness Center",
"Teagle Down Fitness Center": "Teagle Down Fitness Center",
"Teagle Up Fitness Center": "Teagle Up Fitness Center",
"Toni Morrison Fitness Center": "Morrison Fitness Center",
"HNH Court 1 Basketball": "HNH Court 1",
"HNH Court 2 Volleyball/Badminton": "HNH Court 2",
"Noyes Court Basketball": "Noyes Court",
}

# Percent sign that accompanies the percent value in the HTML; the capacities
# scraper removes it before converting to float and dividing by 100
CAPACITY_MARKER_PERCENT = "%"

# Text that appears in place of a percent value in the HTML; the capacities
# scraper uses 0.0 as the percent when it sees this
CAPACITY_MARKER_PERCENT_NA = "NA"

# Label text that accompanies the last-updated value in the HTML; the
# capacities scraper removes it before passing the remainder to
# get_capacity_datetime
CAPACITY_MARKER_UPDATED = "Updated: "

# Days of the week used in the spreadsheet
DAYS_OF_WEEK = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

Expand Down

0 comments on commit 7684c3e

Please sign in to comment.