diff --git a/.gitignore b/.gitignore
index 2dc53ca..d24991a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,10 @@
-# Byte-compiled / optimized / DLL files
+# Byte-compiled / optimized / DLL / Data files
 __pycache__/
 *.py[cod]
 *$py.class
+*.txt
+*.xlsx
+data/oews/raw_data
 
 # C extensions
 *.so
diff --git a/data/oews/loader/loader.py b/data/oews/loader/loader.py
new file mode 100644
index 0000000..9e86a1c
--- /dev/null
+++ b/data/oews/loader/loader.py
@@ -0,0 +1,44 @@
+"""Load OEWS salary data into the ``lighthouse`` MongoDB database."""
+# NOTE(review): no name from this wildcard import appears to be used below; it is
+# kept first, as before, in case the import itself matters -- confirm, then remove.
+from data.onet.models.mapping import *
+
+import asyncio
+import os
+
+from beanie import init_beanie
+from motor.motor_asyncio import AsyncIOMotorClient
+
+from data.oews.loader.mapping import load_job_salary
+from data.oews.models.mapping import Mapping
+
+# Name of the environment variable that holds the MongoDB connection string.
+MONGODB_URI_ENV = 'MONGODB_URI'
+
+
+async def load_mapping():
+    """Load the OEWS job-salary records."""
+    await load_job_salary()
+
+
+async def load_oews_data():
+    """Connect to MongoDB, initialise Beanie and load the OEWS data.
+
+    Raises:
+        RuntimeError: If the connection-string environment variable is unset.
+    """
+    # Credentials must not live in source control; take the URI from the environment.
+    uri = os.environ.get(MONGODB_URI_ENV)
+    if not uri:
+        raise RuntimeError(f'Set {MONGODB_URI_ENV} to the MongoDB connection string')
+
+    client = AsyncIOMotorClient(uri)
+    try:
+        await init_beanie(database=client['lighthouse'], document_models=[Mapping])
+        await load_mapping()
+    finally:
+        client.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(load_oews_data())
diff --git a/data/oews/loader/mapping.py b/data/oews/loader/mapping.py
new file mode 100644
index 0000000..9297b73
--- /dev/null
+++ b/data/oews/loader/mapping.py
@@ -0,0 +1,44 @@
+"""Build ``Mapping`` documents from the OEWS 2022 all-data workbook."""
+import pandas as pd
+
+from data.oews.models.mapping import Mapping
+
+# Cell values that are treated as "no figure available".
+_MISSING_MARKERS = frozenset({'*', '#', '**', '~'})
+
+
+def _clean(value):
+    """Return None for a marker string or an empty (NaN/NA) cell, else the value."""
+    if isinstance(value, str):
+        return None if value in _MISSING_MARKERS else value
+    return None if pd.isna(value) else value
+
+
+async def load_job_salary():
+    """Read the OEWS workbook and insert the resulting ``Mapping`` documents.
+
+    A row whose ``OCC_TITLE`` equals the title of the last kept row is skipped.
+    """
+    base = 'data/oews/raw_data/'
+    df = pd.read_excel(base + 'oesm22all/all_data_M_2022.xlsx')
+
+    items = []
+    for _, row in df.iterrows():
+        title = row['OCC_TITLE']
+        # There are repetitions in the OCC_TITLE column that differ only in O_GROUP.
+        if items and title == items[-1].job_title:
+            continue
+
+        items.append(Mapping(
+            src_element_id=row['OCC_CODE'],
+            job_title=title,
+            hourly_salary=_clean(row['H_MEAN']),
+            annual_salary=_clean(row['A_MEAN']),
+            hourly_median=_clean(row['H_MEDIAN']),
+            annual_median=_clean(row['A_MEDIAN']),
+            total_employment=_clean(row['TOT_EMP']),
+        ))
+
+    # insert_many rejects an empty document list, so skip it when nothing was built.
+    if items:
+        await Mapping.insert_many(items)
diff --git
a/data/oews/models/mapping.py b/data/oews/models/mapping.py
new file mode 100644
index 0000000..ebf8298
--- /dev/null
+++ b/data/oews/models/mapping.py
@@ -0,0 +1,26 @@
+"""Beanie document model for OEWS job-salary records."""
+from typing import Optional
+
+from beanie import Document
+
+
+class Mapping(Document):
+    """One occupation's salary and employment figures (``job_salary`` collection)."""
+
+    class Settings:
+        name = "job_salary"
+
+    src_element_id: str
+    job_title: str
+    # Explicit None defaults keep these fields optional on pydantic v2, where
+    # Optional[...] alone no longer implies a default.
+    hourly_salary: Optional[float] = None
+    annual_salary: Optional[int] = None
+    hourly_median: Optional[float] = None
+    annual_median: Optional[int] = None
+    total_employment: Optional[int] = None
+
+
+def get_mapping_models():
+    """Return the document models declared in this module."""
+    return [Mapping]