From 804dd8175d8e13086b27ddd8050bda14f12041ea Mon Sep 17 00:00:00 2001 From: Belissimo_T Date: Tue, 21 Nov 2023 20:36:24 +0100 Subject: [PATCH 1/5] improve plan download --- backend/load_plans.py | 19 +++++++++++++++---- backend/plan_downloader.py | 37 ++++++++++++++++++++++--------------- backend/plan_processor.py | 2 +- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/backend/load_plans.py b/backend/load_plans.py index 7dfd442c..ed5115d6 100644 --- a/backend/load_plans.py +++ b/backend/load_plans.py @@ -30,10 +30,11 @@ async def check_infinite(self, interval: int = 60, *, once: bool = False, ignore try: downloaded_files = await self.plan_downloader.update_fetch() - self.plan_processor._logger.debug("* Processing plans...") - if downloaded_files: + self.plan_processor._logger.debug("* Processing plans...") self.plan_processor.meta_extractor.invalidate_cache() + else: + self.plan_processor._logger.debug("* No plans to process.") for (date, revision), downloaded_files_metadata in downloaded_files.items(): self.plan_processor.update_plans(date, revision) @@ -51,6 +52,7 @@ async def check_infinite(self, interval: int = 60, *, once: bool = False, ignore if once: break + self.plan_downloader._logger.debug(f"* Waiting {interval} s.") await asyncio.sleep(interval) @@ -68,10 +70,19 @@ async def get_crawlers(session: aiohttp.ClientSession | None = None, cache = Cache(Path(f".cache/{specifier}").absolute()) if create_clients: - data["hosting"]["creds"] = data["hosting"]["creds"].get("teachers", data["hosting"]["creds"].get("students")) + data["hosting"]["creds"] = data["hosting"]["creds"].get( + "teachers", data["hosting"]["creds"].get("students") + ) hosting = Hosting.deserialize(data["hosting"]) + client = IndiwareStundenplanerClient(hosting, session) + if hosting.creds is not None and hosting.creds.username == "schueler": + logger.warning("* Disabling room and teacher plans because only student creds are available.") + # avoid trying to fetch room and teacher plans if no creds are available + client.teacher_plan_client = None + client.room_plan_client = None + for plan_client in client.substitution_plan_clients: plan_client.proxy_provider = proxy_provider plan_client.no_delay = True @@ -103,7 +114,7 @@ async def main(): argument_parser.add_argument("--only-process", action="store_true", help="Do not download plans, only parse existing.") argument_parser.add_argument("--ignore-exceptions", action="store_true", - help="Don't raise exceptions and crash the program, instead print them and continue.") + help="Don't raise exceptions and crash the program, instead, print them and continue.") argument_parser.add_argument("--never-raise-out-of-proxies", action="store_true", help="Never crash the program if no more proxies seem to be available. " "Keep trying instead.") diff --git a/backend/plan_downloader.py b/backend/plan_downloader.py index 13d89988..dd84e652 100644 --- a/backend/plan_downloader.py +++ b/backend/plan_downloader.py @@ -15,8 +15,8 @@ from .cache import Cache - import stundenplan24_py.client + stundenplan24_py.client.set_min_delay_between_requests(0.2) @@ -72,11 +72,18 @@ async def update_fetch(self) -> dict[tuple[datetime.date, datetime.datetime], li new: set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]] = set() - # not using asyncio.gather because the logs would be confusing - for indiware_client in self.client.indiware_mobil_clients: - new |= await self.fetch_indiware_mobil(indiware_client) + fetched = await asyncio.gather( + *(self.fetch_indiware_mobil(indiware_client) for indiware_client in self.client.indiware_mobil_clients), + self.fetch_substitution_plans() + ) - new |= await self.fetch_substitution_plans() + for fetched_set in fetched: + new |= fetched_set + + self.cache.store_meta_file( + json.dumps({"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat()}), + "last_fetch.json" + ) out: dict[tuple[datetime.date, datetime.datetime], list[PlanFileMetadata]] = defaultdict(list) for date, revision, file_metadata in new: @@ -85,8 +92,8 @@ async def update_fetch(self) -> dict[tuple[datetime.date, datetime.datetime], li return out async def fetch_indiware_mobil( - self, - indiware_client: IndiwareMobilClient + self, + indiware_client: IndiwareMobilClient ) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]: try: self._logger.debug(f"=> Fetching Indiware Mobil available files.") @@ -102,9 +109,9 @@ async def fetch_indiware_mobil( return await self.download_indiware_mobil(indiware_client, plan_files) async def download_indiware_mobil( - self, - client: IndiwareMobilClient, - downloadable_plan_files: dict[str, datetime.datetime] + self, + client: IndiwareMobilClient, + downloadable_plan_files: dict[str, datetime.datetime] ) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]: new: set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]] = set() @@ -149,8 +156,8 @@ async def fetch_substitution_plans(self) -> set[tuple[datetime.date, datetime.da return out async def fetch_substitution_plan( - self, - plan_client: SubstitutionPlanClient + self, + plan_client: SubstitutionPlanClient ) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]: self._logger.debug("=> Checking for new substitution plans...") @@ -197,9 +204,9 @@ def valid_date_iterator(start: datetime.date, step: int = 1): return out async def download_substitution_plan( - self, - plan_client: SubstitutionPlanClient, - date: datetime.date + self, + plan_client: SubstitutionPlanClient, + date: datetime.date ) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]: if isinstance(plan_client.endpoint, StudentsSubstitutionPlanEndpoint): plan_filename = "VplanKl.xml" diff --git a/backend/plan_processor.py b/backend/plan_processor.py index dc0eecd6..1ea349c0 100644 --- a/backend/plan_processor.py +++ b/backend/plan_processor.py @@ -28,7 +28,7 @@ def __init__(self, cache: Cache, school_number: str, *, logger: logging.Logger): self.load_teachers() def load_teachers(self): - self._logger.info("* Loading cached teachers...") + self._logger.debug("* Loading cached teachers...") try: data = json.loads(self.cache.get_meta_file("teachers.json")) except FileNotFoundError: From 62dcea2163cffac6eb6d41c5462c72a42cf1db1f Mon Sep 17 00:00:00 2001 From: Belissimo_T Date: Wed, 22 Nov 2023 20:03:16 +0100 Subject: [PATCH 2/5] add db event logging --- backend/events.py | 157 +++++++++++++++++++++++++++++++++++++ backend/load_plans.py | 22 +++++- backend/plan_downloader.py | 21 +++-- backend/plan_processor.py | 51 +++++++++--- 4 files changed, 230 insertions(+), 21 deletions(-) create mode 100644 backend/events.py diff --git a/backend/events.py b/backend/events.py new file mode 100644 index 00000000..4d1c653e --- /dev/null +++ b/backend/events.py @@ -0,0 +1,157 @@ +import asyncio +import dataclasses +import datetime +import logging +import time +import typing + +import dotenv.main as dotenv +import pymongo + + +@dataclasses.dataclass +class Event: + school_number: str + start_time: datetime.datetime + end_time: datetime.datetime + + def get_base_dict(self): + assert self.start_time.tzinfo is not None is not self.end_time.tzinfo + return { + "school_number": self.school_number, + "start_time": self.start_time.isoformat(), + "end_time": self.end_time.isoformat() + } + + +@dataclasses.dataclass +class PlanDownload(Event): + plan_type: str # Ex: "PlanKl.xml" or "VPlanKl.xml" + last_modified: datetime.datetime + file_length: int + + +@dataclasses.dataclass +class AllPlansDownloaded(Event): + pass + + +@dataclasses.dataclass +class PlanCrawlCycle(Event): + pass + + +@dataclasses.dataclass +class RevisionProcessed(Event): + version: str + date: datetime.date + revision: datetime.datetime + + +@dataclasses.dataclass +class StudentsRevisionProcessed(RevisionProcessed): + has_vplan: bool | None + + +@dataclasses.dataclass +class TeachersRevisionProcessed(RevisionProcessed): + pass + + +@dataclasses.dataclass +class TeacherScrape(Event): + teacher_count: int + + +@dataclasses.dataclass +class MetaUpdate(Event): + pass + + +_T = typing.TypeVar("_T", bound=Event) + + +class Timer(typing.Generic[_T]): + start: datetime.datetime + end: datetime.datetime + + def __init__(self, school_number: str, event_type: typing.Type[_T]): + self.school_number = school_number + self.event_type = event_type + + def __enter__(self): + self.start = now() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.end = now() + + def construct(self, **kwargs) -> _T: + return self.event_type( + school_number=self.school_number, + start_time=self.start, + end_time=self.end, + **kwargs + ) + + async def submit_async(self, **kwargs): + await submit_event_async(self.construct(**kwargs)) + + def submit(self, **kwargs): + return _submit_event(self.construct(**kwargs)) + + +def _submit_event(event: Event): + if _EVENTS_COLLECTION is None: + logging.debug("No MongoDB collection found. Not submitting event.") + return + + event_base_dict = event.get_base_dict() + + out = {} + for key, value in dataclasses.asdict(event).items(): + if key in event_base_dict: + continue + + if isinstance(value, datetime.datetime): + out[key] = value.isoformat() + else: + out[key] = value + + entity = { + **event_base_dict, + "type": event.__class__.__name__, + "data": out + } + + _EVENTS_COLLECTION.insert_one(entity) + + +async def submit_event_async(event: Event): + await asyncio.get_event_loop().run_in_executor(None, _submit_event, event) + + +def submit_event(event: Event): + return asyncio.get_event_loop().create_task(submit_event_async(event)) + + +def now() -> datetime.datetime: + return datetime.datetime.now(datetime.timezone.utc) + + +def get_mongodb_event_collection() -> pymongo.collection.Collection | None: + env = dotenv.DotEnv(dotenv_path=dotenv.find_dotenv()) + + if not env.get("PRODUCTION"): + logging.warning("Not in production mode. Not submitting events.") + return + + if (mongo_uri := env.get("MONGO_URL")) is not None: + collection = pymongo.MongoClient(mongo_uri).get_database("vplan").get_collection("events") + logging.info("Event collection found.") + return collection + else: + logging.warning("No MONGO_URI found in .env file.") + + +_EVENTS_COLLECTION = get_mongodb_event_collection() diff --git a/backend/load_plans.py b/backend/load_plans.py index ed5115d6..18dbd084 100644 --- a/backend/load_plans.py +++ b/backend/load_plans.py @@ -11,6 +11,7 @@ IndiwareStundenplanerClient, Hosting, proxies ) +from . import events from .creds_provider import creds_provider_factory from .plan_downloader import PlanDownloader from .cache import Cache @@ -18,7 +19,10 @@ class PlanCrawler: - def __init__(self, plan_downloader: PlanDownloader, plan_processor: PlanProcessor): + school_number: str + + def __init__(self, school_number: str, plan_downloader: PlanDownloader, plan_processor: PlanProcessor): + self.school_number = school_number self.plan_downloader = plan_downloader self.plan_processor = plan_processor @@ -27,6 +31,7 @@ async def check_infinite(self, interval: int = 60, *, once: bool = False, ignore self.plan_processor.update_all() while True: + _t1 = events.now() try: downloaded_files = await self.plan_downloader.update_fetch() @@ -48,6 +53,15 @@ async def check_infinite(self, interval: int = 60, *, once: bool = False, ignore raise else: self.plan_processor._logger.error("An error occurred.", exc_info=e) + else: + _t2 = events.now() + await events.submit_event_async( + events.PlanCrawlCycle( + school_number=self.school_number, + start_time=_t1, + end_time=_t2, + ) + ) if once: break @@ -93,11 +107,11 @@ async def get_crawlers(session: aiohttp.ClientSession | None = None, else: client = None - plan_downloader = PlanDownloader(client, cache, logger=logger) + plan_downloader = PlanDownloader(specifier, client, cache, logger=logger) plan_processor = PlanProcessor(cache, specifier, logger=logger) # create crawler - p = PlanCrawler(plan_downloader, plan_processor) + p = PlanCrawler(specifier, plan_downloader, plan_processor) crawlers[school_name] = p @@ -128,7 +142,7 @@ async def main(): pass logging.basicConfig(level=args.loglevel, format="[%(asctime)s] [%(levelname)8s] %(name)s: %(message)s", - datefmt="%Y-%m-%d %H:%M:%S") + datefmt="%Y-%m-%d %H:%M:%S", force=True) proxy_provider = proxies.ProxyProvider(Path("proxies.json").absolute(), never_raise_out_of_proxies=args.never_raise_out_of_proxies) diff --git a/backend/plan_downloader.py b/backend/plan_downloader.py index dd84e652..06a34923 100644 --- a/backend/plan_downloader.py +++ b/backend/plan_downloader.py @@ -12,6 +12,7 @@ IndiwareStundenplanerClient, IndiwareMobilClient, PlanClientError, SubstitutionPlanClient, UnauthorizedError, substitution_plan, PlanNotFoundError, StudentsSubstitutionPlanEndpoint, TeachersSubstitutionPlanEndpoint ) +from . import events from .cache import Cache @@ -45,9 +46,11 @@ def deserialize(cls, data: dict) -> PlanFileMetadata: class PlanDownloader: """Check for new indiware plans in regular intervals store them in cache.""" - def __init__(self, client: IndiwareStundenplanerClient, cache: Cache, *, logger: logging.Logger): + def __init__(self, school_number: str, client: IndiwareStundenplanerClient, cache: Cache, *, + logger: logging.Logger): self._logger = logger + self.school_number = school_number self.client = client self.cache = cache @@ -72,10 +75,12 @@ async def update_fetch(self) -> dict[tuple[datetime.date, datetime.datetime], li new: set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]] = set() - fetched = await asyncio.gather( - *(self.fetch_indiware_mobil(indiware_client) for indiware_client in self.client.indiware_mobil_clients), - self.fetch_substitution_plans() - ) + with events.Timer(self.school_number, events.AllPlansDownloaded) as timer: + fetched = await asyncio.gather( + *(self.fetch_indiware_mobil(indiware_client) for indiware_client in self.client.indiware_mobil_clients), + self.fetch_substitution_plans() + ) + await timer.submit_async() for fetched_set in fetched: new |= fetched_set @@ -132,7 +137,8 @@ async def download_indiware_mobil( else: self._logger.info(f" -> Downloading indiware {filename!r}. Revision: {revision!s}.") - plan_response = await client.fetch_plan(filename) + with events.Timer(self.school_number, events.PlanDownload) as timer: + plan_response = await client.fetch_plan(filename) assert plan_response.last_modified is not None downloaded_file = PlanFileMetadata( @@ -141,6 +147,9 @@ async def download_indiware_mobil( etag=plan_response.etag, ) + await timer.submit_async(plan_type=plan_filename, last_modified=plan_response.last_modified, + file_length=len(plan_response.content)) + self.cache.store_plan_file(date, revision, plan_response.content, plan_filename) self.cache.store_plan_file(date, revision, json.dumps(downloaded_file.serialize()), plan_filename + ".json") diff --git a/backend/plan_processor.py b/backend/plan_processor.py index 1ea349c0..71708c3a 100644 --- a/backend/plan_processor.py +++ b/backend/plan_processor.py @@ -4,7 +4,7 @@ import json import logging -from . import schools, default_plan +from . import schools, default_plan, events from .cache import Cache from .meta_extractor import MetaExtractor from .teacher import Teachers @@ -73,6 +73,7 @@ def compute_plans(self, date: datetime.date, timestamp: datetime.datetime): except FileNotFoundError: self._logger.warning(f"=> Could not find Indiware form plan for date {date!s} and timestamp {timestamp!s}.") else: + _t1 = events.now() try: vplan_kl = self.cache.get_plan_file(date, timestamp, "VplanKl.xml", newest_before=True) except FileNotFoundError: @@ -152,11 +153,23 @@ def compute_plans(self, date: datetime.date, timestamp: datetime.datetime): "info.json" ) + _t2 = events.now() + events.submit_event(events.StudentsRevisionProcessed( + school_number=self.school_number, + start_time=_t1, + end_time=_t2, + version=self.VERSION, + date=date, + revision=timestamp, + has_vplan=vplan_kl is not None + )) + try: plan_le = self.cache.get_plan_file(date, timestamp, "PlanLe.xml", newest_before=True) except FileNotFoundError: pass else: + _t1 = events.now() try: plan_ra = self.cache.get_plan_file(date, timestamp, "PlanRa.xml", newest_before=True) except FileNotFoundError: @@ -201,6 +214,16 @@ def compute_plans(self, date: datetime.date, timestamp: datetime.datetime): "rooms.teachers.json" ) + _t2 = events.now() + events.submit_event(events.TeachersRevisionProcessed( + school_number=self.school_number, + start_time=_t1, + end_time=_t2, + version=self.VERSION, + date=date, + revision=timestamp + )) + self.cache.update_newest(date) self.cache.store_plan_file(date, timestamp, str(self.VERSION), ".processed") @@ -212,16 +235,19 @@ def update_meta(self): self._logger.info("=> No plans cached yet.") return - data = { - "free_days": [date.isoformat() for date in self.meta_extractor.free_days()] - } - self.cache.store_meta_file(json.dumps(data), "meta.json") - self.cache.store_meta_file(json.dumps(self.meta_extractor.dates_data()), "dates.json") + with events.Timer(self.school_number, events.MetaUpdate) as timer: + data = { + "free_days": [date.isoformat() for date in self.meta_extractor.free_days()] + } + self.cache.store_meta_file(json.dumps(data), "meta.json") + self.cache.store_meta_file(json.dumps(self.meta_extractor.dates_data()), "dates.json") + + self.teachers.add_teachers(*self.meta_extractor.teachers()) + self.scrape_teachers() + self.update_forms() + self.update_rooms() - self.teachers.add_teachers(*self.meta_extractor.teachers()) - self.scrape_teachers() - self.update_forms() - self.update_rooms() + timer.submit() def scrape_teachers(self): if datetime.datetime.now() - self.teachers.scrape_timestamp < datetime.timedelta(hours=6): @@ -236,10 +262,13 @@ def scrape_teachers(self): else: self._logger.info("=> Scraping teachers...") try: - scraped_teachers = schools.teacher_scrapers[str(self.school_number)]() + with events.Timer(self.school_number, events.TeacherScrape) as timer: + scraped_teachers = schools.teacher_scrapers[str(self.school_number)]() except Exception as e: self._logger.error(" -> Exception while scraping teachers.", exc_info=e) scraped_teachers = [] + else: + timer.submit(teacher_count=len(scraped_teachers)) self._logger.debug(f" -> Found {len(scraped_teachers)} teachers.") From 3e4dbc4e3257598893c9d2e8ceec137950a8f2ce Mon Sep 17 00:00:00 2001 From: Belissimo_T Date: Wed, 22 Nov 2023 20:03:27 +0100 Subject: [PATCH 3/5] fix type annotation --- backend/vplan_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/vplan_utils.py b/backend/vplan_utils.py index abb3aee4..5fdd96ee 100644 --- a/backend/vplan_utils.py +++ b/backend/vplan_utils.py @@ -255,7 +255,7 @@ def parse_absent_element(element: str) -> tuple[str, set[int]]: return label, periods -def find_closest_date(dates) -> datetime.datetime | None: +def find_closest_date(dates) -> datetime.date | None: now = datetime.datetime.now() today = now.date() future_dates = [d for d in dates if d > today] From b5777004825fd0420497ec0abdab8bcbec191b10 Mon Sep 17 00:00:00 2001 From: Belissimo_T Date: Wed, 22 Nov 2023 20:03:53 +0100 Subject: [PATCH 4/5] refactor update_rooms() --- backend/plan_processor.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/backend/plan_processor.py b/backend/plan_processor.py index 71708c3a..3d35d260 100644 --- a/backend/plan_processor.py +++ b/backend/plan_processor.py @@ -300,28 +300,20 @@ def update_forms(self): def update_rooms(self): self._logger.info("* Updating rooms...") - all_rooms = self.meta_extractor.rooms() - parsed_rooms: dict[str, dict] = {} + parsed_rooms: dict[str, dict] = {room: None for room in self.meta_extractor.rooms()} try: room_parser = schools.room_parsers[str(self.school_number)] - - for room in all_rooms: + except KeyError: + self._logger.debug("=> No room parser available for this school.") + else: + for room in parsed_rooms: try: parsed_rooms[room] = room_parser(room).to_dict() except Exception as e: - self._logger.error(f" -> Error while parsing room {room!r}: {e}") - - except KeyError: - self._logger.debug("=> No room parser available for this school.") - - parsed_rooms = {room: None for room in all_rooms} - - data = { - room: parsed_rooms.get(room) for room in all_rooms - } + self._logger.error(f" -> Error while parsing room {room!r}.", exc_info=e) self.cache.store_meta_file( - json.dumps(data), + json.dumps(parsed_rooms), "rooms.json" ) From 04c8cfc77d06042276b31a56597cbc24b552a9c1 Mon Sep 17 00:00:00 2001 From: Belissimo_T Date: Wed, 22 Nov 2023 20:05:49 +0100 Subject: [PATCH 5/5] fix 804dd817 --- backend/load_plans.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/load_plans.py b/backend/load_plans.py index 18dbd084..6da34834 100644 --- a/backend/load_plans.py +++ b/backend/load_plans.py @@ -96,6 +96,7 @@ async def get_crawlers(session: aiohttp.ClientSession | None = None, # avoid trying to fetch room and teacher plans if no creds are available client.teacher_plan_client = None client.room_plan_client = None + client.teachers_substitution_plan_client = None for plan_client in client.substitution_plan_clients: plan_client.proxy_provider = proxy_provider