Improve allergen scraping

ZeusWPI · Jan 24, 2024 · 0f10162 · 0f10162
1 parent 8c57deb
commit 0f10162
Showing 1 changed file with 10 additions and 3 deletions.
diff --git a/server/scraper/resto/allergens.py b/server/scraper/resto/allergens.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import argparse
+import itertools
 import os
 import sys
 from typing import Union
@@ -16,7 +17,10 @@
 URL = "https://www.ugent.be/student/nl/meer-dan-studeren/resto/allergenen"
 SKIPPED_ELEMENTS = [
     "vegetarisch",
-    "vegan"
+    "vegan",
+    "veggie",
+    "msc",
+    "asc"
 ]
 
 
@@ -33,18 +37,21 @@ def parse_section_item(section_item: str) -> Union[dict[str, list[str]], None]:
         item_name = "Soep van de dag"
         item_allergen_list = section_item
     else:
-        item_name, item_allergen_list = section_item.split(":", maxsplit=1)
+        item_name, item_allergen_list = section_item.rsplit(":", maxsplit=1)
 
     # Sometimes a section will have extra info before the item list,
     # this should not be parsed
     if item_allergen_list == "":
         return None
 
     item_allergens = list(map(lambda a: a.strip(), item_allergen_list.split(",")))
+    # Split each entry on "-" as well, so every part becomes its own item
+    item_allergens = list(itertools.chain.from_iterable(item.split("-") for item in item_allergens))
+    item_allergens = [x.strip().strip(".") for x in item_allergens]
 
     # Exclude entries listed in SKIPPED_ELEMENTS: they are not allergens
     # but diet names or labels, eg. 'vegetarisch' or 'vegan'
-    return {item_name.lower(): sorted({x.strip(".") for x in item_allergens if x.strip(".") not in SKIPPED_ELEMENTS})}
+    return {item_name.lower(): sorted({x for x in item_allergens if x not in SKIPPED_ELEMENTS})}
 
 
 def make_sections(