Start of year update #473

Merged (5 commits, Oct 3, 2023)
7 changes: 4 additions & 3 deletions server/scraper/resto/allergens.py
@@ -28,12 +28,12 @@ def parse_section_item(section_item: str) -> Union[dict[str, list[str]], None]:
"""
Parses strings of the form `food: allergen, allergen, allergen`
"""

if "soep van de dag" in section_item:
item_name = "Soep van de dag"
item_allergen_list = section_item
else:
item_name, item_allergen_list = section_item.split(":")
item_name, item_allergen_list = section_item.split(":", maxsplit=1)

# Sometimes a section will have extra info before the item list,
# this should not be parsed
@@ -73,7 +73,8 @@ def make_sections(
sections[section_header] = dict()
for raw_section_item in raw_section_items:
section_item = raw_section_item.get_text(strip=True)
assert section_item is not None
if not section_item:
continue

section_item_map = parse_section_item(section_item)
if section_item_map is None:
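As a side note on the maxsplit change above: a minimal sketch (the menu string is invented) of why splitting on only the first colon matters when the allergen list itself contains another colon.

section_item = "broodje gezond: gluten: tarwe, melk, mosterd"  # invented example

# Old behaviour: split(":") produces three parts, so unpacking into
# two variables raises ValueError.
parts = section_item.split(":")
assert len(parts) == 3

# New behaviour: only the first colon separates the food from its allergens.
item_name, item_allergen_list = section_item.split(":", maxsplit=1)
assert item_name == "broodje gezond"
assert item_allergen_list.strip() == "gluten: tarwe, melk, mosterd"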
27 changes: 17 additions & 10 deletions server/scraper/resto/menu.py
@@ -5,8 +5,9 @@
import json
import os
import re
import warnings

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import string
import sys
import traceback
@@ -15,6 +16,8 @@

from pyquery import PyQuery as pq

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Bad python module system
sys.path.append('..')
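A short sketch of what the new filter suppresses (not taken from this repository, and the "menu.html" string is invented): bs4 emits MarkupResemblesLocatorWarning when the string handed to BeautifulSoup looks like a filename or URL rather than markup, which is presumably what short scraped fragments trigger here.

import warnings
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

# A short, filename-like string makes bs4 warn that the input
# "looks more like a filename than markup".
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BeautifulSoup("menu.html", "html.parser")
print([w.category.__name__ for w in caught])  # includes 'MarkupResemblesLocatorWarning'

# The module-level filter used in this diff silences only that category;
# other warnings remain visible.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)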

@@ -30,16 +33,11 @@
"en": "https://www.ugent.be/en/facilities/restaurants/weekly-menu",
"nl": "https://www.ugent.be/student/nl/meer-dan-studeren/resto/weekmenu",
"nl-debrug-avond": "https://www.ugent.be/student/nl/meer-dan-studeren/resto/weekmenubrugavond",
"nl-coupure": "https://www.ugent.be/student/nl/meer-dan-studeren/resto/weekmenu",
"nl-dunant": "https://www.ugent.be/student/nl/meer-dan-studeren/resto/weekmenu",
"nl-merelbeke": "https://www.ugent.be/student/nl/meer-dan-studeren/resto/weekmenu",
}

NORMAL_WEEK = re.compile(r"week(\d+)$")
INDIVIDUAL_DAY_URL_OVERRIDE = {
"nl-coupure": r"week(\d+)coupure$",
"nl-dunant": r"week(\d+)(merelbekedunant|dunant)$",
"nl-merelbeke": r"week(\d+)(merelbekedunant|merelbeke)$",
"nl-debrug": r"week(\d+)brugsterre|week(27)duurzaam|week(28)duurzaam",
"nl-sterre": r"week(\d+)(brugsterre|sterre)|week(27)duurzaam",
"nl-ardoyen": r"week(\d+)ardoyen"
@@ -50,12 +48,9 @@
# which is very useful.
COPIED_ENDPOINTS = {
"nl-debrug": "nl",
"nl-heymans": "nl",
"nl-dunant": "nl",
"nl-coupure": "nl",
"nl-sterre": "nl",
"nl-ardoyen": "nl",
"nl-merelbeke": "nl",
}

# Day names to day of the week.
@@ -346,8 +341,20 @@ def get_day_menu(which, url, allergens: Dict[str, str]):

if HEADING_TO_TYPE[last_heading] == 'soup':
name, price = split_price(meal)
if "€" in name:
name, price_large = split_price(name)
else:
price_large = None
food_allergens = find_allergens_for_food(allergens, name)
soups.append(dict(price=price, name=name, type='side', allergens=food_allergens))
if price_large:
small = "klein" if "nl" in which else "small"
big = "groot" if "nl" in which else "big"
name_small = f"{name} {small}"
name_big = f"{name} {big}"
soups.append(dict(price=price, name=name_small, type='side', allergens=food_allergens))
soups.append(dict(price=price_large, name=name_big, type='side', allergens=food_allergens))
else:
soups.append(dict(price=price, name=name, type='side', allergens=food_allergens))
elif HEADING_TO_TYPE[last_heading] == 'meal soup':
name, price = split_price(meal)
food_allergens = find_allergens_for_food(allergens, name)
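For context on the soup block above, a simplified stand-in for util.split_price (the menu string is invented, and move_junk_from_price_to_name is assumed to pass clean values through unchanged). The diff pairs the last price found in the string with the small portion and the first one with the large portion.

def split_price_sketch(meal):
    # Keeps only the separator logic of util.split_price.
    for sep in ("-", "/"):
        if sep in meal and "€" in meal:
            *head, price = meal.split(sep)
            return sep.join(head).strip(), price.strip()
    if "€" in meal:
        name, price = meal.split("€")
        return name.strip(), price.strip()
    return meal.strip(), ""

meal = "Tomatensoep - € 4,50 - € 2,50"        # invented menu line
name, price = split_price_sketch(meal)        # ("Tomatensoep - € 4,50", "€ 2,50")
name, price_large = split_price_sketch(name)  # ("Tomatensoep", "€ 4,50")
# get_day_menu then emits "Tomatensoep klein" at € 2,50 and "Tomatensoep groot" at € 4,50.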
15 changes: 8 additions & 7 deletions server/scraper/resto/sandwiches.py
@@ -12,7 +12,7 @@
import sys
sys.path.append('..')

from util import parse_money, write_json_to_file
from util import parse_money, write_json_to_file, split_price

SANDWICHES_URL = "https://www.ugent.be/student/nl/meer-dan-studeren/resto/broodjes/overzicht.htm"
HTML_PARSER = 'lxml'
@@ -116,8 +116,7 @@ def weekly_sandwiches(output, soup):
'start': start,
'end': end,
'name': columns[1].text.strip(),
'ingredients': parse_ingredients(columns[2].text),
'vegan': 'x' in columns[3].text
'ingredients': parse_ingredients(columns[2].text)
})

today = datetime.date.today()
@@ -166,15 +165,17 @@ def salad_bowls(output, soup):
"""
bowls = []

tables = soup.find_all('table', limit=3)
tables = soup.find_all('table', limit=4)

if len(tables) >= 3:
for row in soup.find_all('table', limit=3)[2].find_all("tr", class_=lambda x: x != 'tabelheader'):
if len(tables) >= 4:
header = soup.find('a', id="salad-bowls").parent
_, price = split_price(header.text) if header else (None, None)
for row in tables[3].find_all("tr", class_=lambda x: x != 'tabelheader'):
columns = row.find_all("td")
bowls.append({
'name': columns[0].text.strip(),
'description': columns[1].text.strip(),
'price': parse_money(columns[2].string) if columns[2].string else ""
'price': parse_money(price) if price else ""
})

output_file = os.path.join(output, SALADS)
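For context on the salad_bowls change above, a hedged sketch with invented HTML (the live page layout may differ): the price is now read once from the heading that carries the salad-bowls anchor and applied to every row, instead of being taken from a third column per row.

from bs4 import BeautifulSoup

html = """
<h3><a id="salad-bowls"></a>Salad bowls - € 6,50</h3>
<table>
  <tr class="tabelheader"><td>Naam</td><td>Beschrijving</td></tr>
  <tr><td>Caesar</td><td>kip, parmezaan, croutons</td></tr>
</table>
"""  # invented fragment

soup = BeautifulSoup(html, "lxml")
header = soup.find("a", id="salad-bowls").parent
print(header.text)  # "Salad bowls - € 6,50"
# split_price(header.text) should yield ("Salad bowls", "€ 6,50"); only the price
# part is kept, and parse_money turns it into the value stored for every bowl.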
7 changes: 6 additions & 1 deletion server/scraper/util.py
@@ -49,9 +49,14 @@ def split_price(meal):
name = '-'.join(meal.split('-')[:-1]).strip()
name, price = move_junk_from_price_to_name(name, price)
return name, price
elif "/" in meal and "€" in meal:
price = meal.split('/')[-1].strip()
name = '/'.join(meal.split('/')[:-1]).strip()
name, price = move_junk_from_price_to_name(name, price)
return name, price
elif "€" in meal:
meal, price = meal.split("€")
return meal.strip(), price
return meal.strip(), price.strip()
else:
return meal.strip(), ""

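To illustrate the branches of split_price after this change (invented inputs; run from server/scraper/ so util is importable, and move_junk_from_price_to_name is assumed to pass clean values through unchanged):

from util import split_price

split_price("Spaghetti bolognese - € 4,10")  # ("Spaghetti bolognese", "€ 4,10")  existing "-" branch
split_price("Soep van de dag / € 1,10")      # ("Soep van de dag", "€ 1,10")      new "/" branch
split_price("Veggie wrap € 3,80")            # ("Veggie wrap", "3,80")            bare "€", price now stripped
split_price("Vegetarische dagschotel")       # ("Vegetarische dagschotel", "")    no price at all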