Merge pull request #241 from ArtrenH/main

2023-11-22

Belissimo-T authored Nov 22, 2023
2 parents ea3a624 + 04c8cfc commit 79733c2
Showing 5 changed files with 273 additions and 53 deletions.
157 changes: 157 additions & 0 deletions backend/events.py
@@ -0,0 +1,157 @@
import asyncio
import dataclasses
import datetime
import logging
import time
import typing

import dotenv.main as dotenv
import pymongo


@dataclasses.dataclass
class Event:
    school_number: str
    start_time: datetime.datetime
    end_time: datetime.datetime

    def get_base_dict(self):
        assert self.start_time.tzinfo is not None is not self.end_time.tzinfo
        return {
            "school_number": self.school_number,
            "start_time": self.start_time.isoformat(),
            "end_time": self.end_time.isoformat()
        }


@dataclasses.dataclass
class PlanDownload(Event):
    plan_type: str  # Ex: "PlanKl.xml" or "VPlanKl.xml"
    last_modified: datetime.datetime
    file_length: int


@dataclasses.dataclass
class AllPlansDownloaded(Event):
    pass


@dataclasses.dataclass
class PlanCrawlCycle(Event):
    pass


@dataclasses.dataclass
class RevisionProcessed(Event):
    version: str
    date: datetime.date
    revision: datetime.datetime


@dataclasses.dataclass
class StudentsRevisionProcessed(RevisionProcessed):
    has_vplan: bool | None


@dataclasses.dataclass
class TeachersRevisionProcessed(RevisionProcessed):
    pass


@dataclasses.dataclass
class TeacherScrape(Event):
    teacher_count: int


@dataclasses.dataclass
class MetaUpdate(Event):
    pass


_T = typing.TypeVar("_T", bound=Event)


class Timer(typing.Generic[_T]):
    start: datetime.datetime
    end: datetime.datetime

    def __init__(self, school_number: str, event_type: typing.Type[_T]):
        self.school_number = school_number
        self.event_type = event_type

    def __enter__(self):
        self.start = now()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = now()

    def construct(self, **kwargs) -> _T:
        return self.event_type(
            school_number=self.school_number,
            start_time=self.start,
            end_time=self.end,
            **kwargs
        )

    async def submit_async(self, **kwargs):
        await submit_event_async(self.construct(**kwargs))

    def submit(self, **kwargs):
        return _submit_event(self.construct(**kwargs))


def _submit_event(event: Event):
    if _EVENTS_COLLECTION is None:
        logging.debug("No MongoDB collection found. Not submitting event.")
        return

    event_base_dict = event.get_base_dict()

    out = {}
    for key, value in dataclasses.asdict(event).items():
        if key in event_base_dict:
            continue

        if isinstance(value, datetime.datetime):
            out[key] = value.isoformat()
        else:
            out[key] = value

    entity = {
        **event_base_dict,
        "type": event.__class__.__name__,
        "data": out
    }

    _EVENTS_COLLECTION.insert_one(entity)


async def submit_event_async(event: Event):
    await asyncio.get_event_loop().run_in_executor(None, _submit_event, event)


def submit_event(event: Event):
    return asyncio.get_event_loop().create_task(submit_event_async(event))


def now() -> datetime.datetime:
    return datetime.datetime.now(datetime.timezone.utc)


def get_mongodb_event_collection() -> pymongo.collection.Collection | None:
    env = dotenv.DotEnv(dotenv_path=dotenv.find_dotenv())

    if not env.get("PRODUCTION"):
        logging.warning("Not in production mode. Not submitting events.")
        return

    if (mongo_uri := env.get("MONGO_URL")) is not None:
        collection = pymongo.MongoClient(mongo_uri).get_database("vplan").get_collection("events")
        logging.info("Event collection found.")
        return collection
    else:
        logging.warning("No MONGO_URL found in .env file.")


_EVENTS_COLLECTION = get_mongodb_event_collection()
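
The module is used either through the Timer context manager, which stamps start_time and end_time around a block of work, or by constructing an Event subclass directly. A minimal usage sketch (assuming the package is importable as backend.events; the school number and field values are purely illustrative):

    from backend import events

    async def example(school_number: str) -> None:
        # Time a unit of work and submit it as an AllPlansDownloaded event.
        with events.Timer(school_number, events.AllPlansDownloaded) as timer:
            ...  # fetch the plans here
        await timer.submit_async()

        # Or build and submit an event explicitly; extra fields end up under "data".
        await events.submit_event_async(
            events.PlanDownload(
                school_number=school_number,
                start_time=events.now(),
                end_time=events.now(),
                plan_type="PlanKl.xml",
                last_modified=events.now(),
                file_length=12345,
            )
        )

Events are only persisted when get_mongodb_event_collection finds PRODUCTION and MONGO_URL in the .env file, for example (values are illustrative):

    PRODUCTION=1
    MONGO_URL=mongodb://localhost:27017

Otherwise _submit_event logs a debug message and returns without writing anything.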
42 changes: 34 additions & 8 deletions backend/load_plans.py
@@ -11,14 +11,18 @@
IndiwareStundenplanerClient, Hosting, proxies
)

from . import events
from .creds_provider import creds_provider_factory
from .plan_downloader import PlanDownloader
from .cache import Cache
from .plan_processor import PlanProcessor


class PlanCrawler:
def __init__(self, plan_downloader: PlanDownloader, plan_processor: PlanProcessor):
school_number: str

def __init__(self, school_number: str, plan_downloader: PlanDownloader, plan_processor: PlanProcessor):
self.school_number = school_number
self.plan_downloader = plan_downloader
self.plan_processor = plan_processor

@@ -27,13 +31,15 @@ async def check_infinite(self, interval: int = 60, *, once: bool = False, ignore
self.plan_processor.update_all()

while True:
_t1 = events.now()
try:
downloaded_files = await self.plan_downloader.update_fetch()

self.plan_processor._logger.debug("* Processing plans...")

if downloaded_files:
self.plan_processor._logger.debug("* Processing plans...")
self.plan_processor.meta_extractor.invalidate_cache()
else:
self.plan_processor._logger.debug("* No plans to process.")

for (date, revision), downloaded_files_metadata in downloaded_files.items():
self.plan_processor.update_plans(date, revision)
@@ -47,10 +53,20 @@ async def check_infinite(self, interval: int = 60, *, once: bool = False, ignore
raise
else:
self.plan_processor._logger.error("An error occurred.", exc_info=e)
else:
_t2 = events.now()
await events.submit_event_async(
events.PlanCrawlCycle(
school_number=self.school_number,
start_time=_t1,
end_time=_t2,
)
)

if once:
break

self.plan_downloader._logger.debug(f"* Waiting {interval} s.")
await asyncio.sleep(interval)


@@ -68,10 +84,20 @@ async def get_crawlers(session: aiohttp.ClientSession | None = None,
cache = Cache(Path(f".cache/{specifier}").absolute())

if create_clients:
data["hosting"]["creds"] = data["hosting"]["creds"].get("teachers", data["hosting"]["creds"].get("students"))
data["hosting"]["creds"] = data["hosting"]["creds"].get(
"teachers", data["hosting"]["creds"].get("students")
)
hosting = Hosting.deserialize(data["hosting"])

client = IndiwareStundenplanerClient(hosting, session)

if hosting.creds is not None and hosting.creds.username == "schueler":
logger.warning("* Disabling room and teacher plans because only student creds are available.")
# avoid trying to fetch room and teacher plans if no creds are available
client.teacher_plan_client = None
client.room_plan_client = None
client.teachers_substitution_plan_client = None

for plan_client in client.substitution_plan_clients:
plan_client.proxy_provider = proxy_provider
plan_client.no_delay = True
@@ -82,11 +108,11 @@ async def get_crawlers(session: aiohttp.ClientSession | None = None,
else:
client = None

plan_downloader = PlanDownloader(client, cache, logger=logger)
plan_downloader = PlanDownloader(specifier, client, cache, logger=logger)
plan_processor = PlanProcessor(cache, specifier, logger=logger)

# create crawler
p = PlanCrawler(plan_downloader, plan_processor)
p = PlanCrawler(specifier, plan_downloader, plan_processor)

crawlers[school_name] = p

@@ -103,7 +129,7 @@ async def main():
argument_parser.add_argument("--only-process", action="store_true",
help="Do not download plans, only parse existing.")
argument_parser.add_argument("--ignore-exceptions", action="store_true",
help="Don't raise exceptions and crash the program, instead print them and continue.")
help="Don't raise exceptions and crash the program, instead, print them and continue.")
argument_parser.add_argument("--never-raise-out-of-proxies", action="store_true",
help="Never crash the program if no more proxies seem to be available. "
"Keep trying instead.")
@@ -117,7 +143,7 @@ async def main():
pass

logging.basicConfig(level=args.loglevel, format="[%(asctime)s] [%(levelname)8s] %(name)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S")
datefmt="%Y-%m-%d %H:%M:%S", force=True)

proxy_provider = proxies.ProxyProvider(Path("proxies.json").absolute(),
never_raise_out_of_proxies=args.never_raise_out_of_proxies)
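With this change the school number (the specifier) is passed through to both the downloader and the crawler, and every successfully completed crawl iteration is reported as a PlanCrawlCycle event. A rough wiring sketch of the updated constructor signatures (assuming specifier, client, cache and logger are set up as in get_crawlers above; the once=True call is only for illustration):

    plan_downloader = PlanDownloader(specifier, client, cache, logger=logger)
    plan_processor = PlanProcessor(cache, specifier, logger=logger)
    crawler = PlanCrawler(specifier, plan_downloader, plan_processor)

    # Runs a single fetch/process cycle and submits one PlanCrawlCycle event for it.
    await crawler.check_infinite(once=True)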
50 changes: 33 additions & 17 deletions backend/plan_downloader.py
@@ -12,11 +12,12 @@
IndiwareStundenplanerClient, IndiwareMobilClient, PlanClientError, SubstitutionPlanClient, UnauthorizedError,
substitution_plan, PlanNotFoundError, StudentsSubstitutionPlanEndpoint, TeachersSubstitutionPlanEndpoint
)
from . import events

from .cache import Cache


import stundenplan24_py.client

stundenplan24_py.client.set_min_delay_between_requests(0.2)


@@ -45,9 +46,11 @@ def deserialize(cls, data: dict) -> PlanFileMetadata:
class PlanDownloader:
"""Check for new indiware plans in regular intervals store them in cache."""

def __init__(self, client: IndiwareStundenplanerClient, cache: Cache, *, logger: logging.Logger):
def __init__(self, school_number: str, client: IndiwareStundenplanerClient, cache: Cache, *,
logger: logging.Logger):
self._logger = logger

self.school_number = school_number
self.client = client
self.cache = cache

@@ -72,11 +75,20 @@ async def update_fetch(self) -> dict[tuple[datetime.date, datetime.datetime], li

new: set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]] = set()

# not using asyncio.gather because the logs would be confusing
for indiware_client in self.client.indiware_mobil_clients:
new |= await self.fetch_indiware_mobil(indiware_client)
with events.Timer(self.school_number, events.AllPlansDownloaded) as timer:
fetched = await asyncio.gather(
*(self.fetch_indiware_mobil(indiware_client) for indiware_client in self.client.indiware_mobil_clients),
self.fetch_substitution_plans()
)
await timer.submit_async()

for fetched_set in fetched:
new |= fetched_set

new |= await self.fetch_substitution_plans()
self.cache.store_meta_file(
json.dumps({"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat()}),
"last_fetch.json"
)

out: dict[tuple[datetime.date, datetime.datetime], list[PlanFileMetadata]] = defaultdict(list)
for date, revision, file_metadata in new:
@@ -85,8 +97,8 @@ async def fetch_indiware_mobil(
return out

async def fetch_indiware_mobil(
self,
indiware_client: IndiwareMobilClient
self,
indiware_client: IndiwareMobilClient
) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]:
try:
self._logger.debug(f"=> Fetching Indiware Mobil available files.")
@@ -102,9 +114,9 @@ async def fetch_indiware_mobil(
return await self.download_indiware_mobil(indiware_client, plan_files)

async def download_indiware_mobil(
self,
client: IndiwareMobilClient,
downloadable_plan_files: dict[str, datetime.datetime]
self,
client: IndiwareMobilClient,
downloadable_plan_files: dict[str, datetime.datetime]
) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]:
new: set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]] = set()

@@ -125,7 +137,8 @@ async def download_indiware_mobil(
else:
self._logger.info(f" -> Downloading indiware {filename!r}. Revision: {revision!s}.")

plan_response = await client.fetch_plan(filename)
with events.Timer(self.school_number, events.PlanDownload) as timer:
plan_response = await client.fetch_plan(filename)

assert plan_response.last_modified is not None
downloaded_file = PlanFileMetadata(
@@ -134,6 +147,9 @@ async def download_indiware_mobil(
etag=plan_response.etag,
)

await timer.submit_async(plan_type=plan_filename, last_modified=plan_response.last_modified,
file_length=len(plan_response.content))

self.cache.store_plan_file(date, revision, plan_response.content, plan_filename)
self.cache.store_plan_file(date, revision, json.dumps(downloaded_file.serialize()),
plan_filename + ".json")
@@ -149,8 +165,8 @@ async def fetch_substitution_plans(self) -> set[tuple[datetime.date, datetime.da
return out

async def fetch_substitution_plan(
self,
plan_client: SubstitutionPlanClient
self,
plan_client: SubstitutionPlanClient
) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]:
self._logger.debug("=> Checking for new substitution plans...")

@@ -197,9 +213,9 @@ def valid_date_iterator(start: datetime.date, step: int = 1):
return out

async def download_substitution_plan(
self,
plan_client: SubstitutionPlanClient,
date: datetime.date
self,
plan_client: SubstitutionPlanClient,
date: datetime.date
) -> set[tuple[datetime.date, datetime.datetime, PlanFileMetadata]]:
if isinstance(plan_client.endpoint, StudentsSubstitutionPlanEndpoint):
plan_filename = "VplanKl.xml"
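For reference, a PlanDownload event submitted from download_indiware_mobil ends up in the vplan.events collection in roughly this shape (per Event.get_base_dict and _submit_event in backend/events.py; the concrete values here are made up):

    {
        "school_number": "10000000",
        "start_time": "2023-11-22T10:00:00+00:00",
        "end_time": "2023-11-22T10:00:01+00:00",
        "type": "PlanDownload",
        "data": {
            "plan_type": "PlanKl.xml",
            "last_modified": "2023-11-22T09:58:30+00:00",
            "file_length": 48213
        }
    }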
