Feature/pdct 1418 Map GCF event data to new json structure (#11)
* Add raise in docstring for verify_required_fields_present

* Add make command to run tests under coverage

* Update docstrings & return types

* Add make command for generating coverage html report

* Fix comment

* Declare empty df to return at end of function

* Remove unused code

* Remove unused code

* Remove old assertion

* Add output file to git ignore

* Dump JSON to file with error handling

* Update cspell.json

* WIP for generating the mapping for GCF events data

* DRY: Break event function into smaller functions

* Bump to 0.1.8

* Move event enums into separate file

* Use verbose pytest

* Create tests for initialise_event_counter

* Update cspell.json

* Create test_append_event.py

* Create test_check_event_dates.py

* Create test_process_event.py

* Fix test_event tests
katybaulch authored Sep 4, 2024
1 parent bab7919 commit cf13a5e
Showing 18 changed files with 528 additions and 123 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -174,3 +174,6 @@ plugins
user_trunk.yaml
user.yaml
tmp

# Output files
output.json
3 changes: 2 additions & 1 deletion .trunk/configs/cspell.json
@@ -35,7 +35,8 @@
"iloc",
"iterrows",
"notna",
"conftest"
"conftest",
"capsys"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
9 changes: 8 additions & 1 deletion Makefile
@@ -41,4 +41,11 @@ build: ## Build the project
	poetry build

test: ## Run tests using pytest
-	poetry run pytest -v
+	poetry run pytest -vvv
+
+test_coverage: ## Run tests using pytest with coverage
+	poetry run coverage run -m pytest -vvv tests
+	coverage report
+
+test_coverage_html: test_coverage ## Run tests using pytest with coverage and generate an HTML report
+	coverage html
8 changes: 8 additions & 0 deletions gcf_data_mapper/cli.py
@@ -1,3 +1,4 @@
import json
import os
import sys
from typing import Any, Optional
@@ -111,6 +112,13 @@ def dump_output(
    if debug:
        click.echo(f"📝 Output file {click.format_filename(output_file)}")

    try:
        with open(output_file, "w+", encoding="utf-8") as f:
            json.dump(mapped_data, f, ensure_ascii=False, indent=2)
    except Exception as e:
        click.echo(f"❌ Failed to dump JSON to file. Error: {e}.")
        sys.exit(1)


if __name__ == "__main__":
    entrypoint()
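As an aside (an illustrative sketch, not part of the diff): `ensure_ascii=False` is what keeps non-ASCII characters, such as accented project or country names, human-readable in the dumped file, which is why the file is opened with `encoding="utf-8"`:

import json

data = {"country": "Côte d'Ivoire"}

# With the default ensure_ascii=True, non-ASCII characters are escaped.
print(json.dumps(data))                      # {"country": "C\u00f4te d'Ivoire"}
print(json.dumps(data, ensure_ascii=False))  # {"country": "Côte d'Ivoire"}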
40 changes: 40 additions & 0 deletions gcf_data_mapper/enums/event.py
@@ -0,0 +1,40 @@
from collections import namedtuple
from enum import Enum

Event = namedtuple("event", ["name", "type", "column_name"])


class EventColumnNames(Enum):
    """The fields the GCF data mapper needs to parse event data."""

    APPROVED = "ApprovalDate"
    UNDER_IMPLEMENTATION = "StartDate"
    COMPLETED = "DateCompletion"
    APPROVED_REF = "ApprovedRef"
    PROJECTS_ID = "ProjectsID"


class EventTypeNames(Enum):
    """The GCF event type names (should map to the GCF taxonomy)."""

    APPROVED = "Approved"
    UNDER_IMPLEMENTATION = "Under Implementation"
    COMPLETED = "Completed"


class Events:
    APPROVED = Event(
        "approved",
        EventTypeNames.APPROVED.value,
        EventColumnNames.APPROVED.value,
    )
    UNDER_IMPLEMENTATION = Event(
        "under_implementation",
        EventTypeNames.UNDER_IMPLEMENTATION.value,
        EventColumnNames.UNDER_IMPLEMENTATION.value,
    )
    COMPLETED = Event(
        "completed",
        EventTypeNames.COMPLETED.value,
        EventColumnNames.COMPLETED.value,
    )
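For orientation, each attribute of `Events` is an `Event` namedtuple, so parser code can look an event up by attribute and read its fields by name. A minimal sketch (not part of the commit; assumes the package is importable):

from gcf_data_mapper.enums.event import Events

approved = Events.APPROVED
print(approved.name)         # approved
print(approved.type)         # Approved
print(approved.column_name)  # ApprovalDate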
123 changes: 116 additions & 7 deletions gcf_data_mapper/parsers/event.py
@@ -1,16 +1,117 @@
-from enum import Enum
from typing import Any, Optional

import click
import pandas as pd

+from gcf_data_mapper.enums.event import Event, EventColumnNames, Events
from gcf_data_mapper.parsers.helpers import verify_required_fields_present


-class RequiredColumns(Enum):
-    APPROVED = "ApprovalDate"
-    UNDER_IMPLEMENTATION = "StartDate"
-    COMPLETED = "DateCompletion"
def append_event(
    gcf_events: list,
    event: Event,
    row: pd.Series,
    approved_ref: str,
    projects_id: str,
    n_value: int,
) -> None:
    """Append an event to the master list that is passed in.

    Remember, because lists are mutable in Python, any changes to the
    list inside a function will be reflected outside of it as a
    reference to the object is passed instead of just the value.

    :param list gcf_events: The list of GCF events.
    :param Event event: The event to append.
    :param pd.Series row: The row of data containing GCF event info.
        Each row corresponds to a GCF 'family'.
    :param str approved_ref: The FP number.
    :param str projects_id: The GCF projects ID.
    :param int n_value: The event number for the given GCF family.
    """
    gcf_events.append(
        {
            "import_id": f"GCF.event.{approved_ref}_{projects_id}.n{n_value:04}",
            "family_import_id": f"GCF.event.{approved_ref}.{projects_id}",
            "event_title": event.type,
            "date": row[event.column_name],
            "event_type_value": event.type,
        }
    )


def check_event_dates(row: pd.Series) -> dict[str, bool]:
    """Check if the row contains valid event date values (not NA).

    :param pd.Series row: The row of data to check.
    :return dict[str, bool]: A dict indicating the presence of each
        event date.
    """
    return {
        Events.APPROVED.name: pd.notna(row.at[Events.APPROVED.column_name]),
        Events.UNDER_IMPLEMENTATION.name: pd.notna(
            row.at[Events.UNDER_IMPLEMENTATION.column_name]
        ),
        Events.COMPLETED.name: pd.notna(row.at[Events.COMPLETED.column_name]),
    }


def initialise_event_counter(
    event_counter: dict[str, int], family_import_id: str
) -> None:
    """Initialise the event counter for a family_import_id if not present.

    Remember, because dictionaries are mutable in Python, any changes to
    the dictionary inside a function will be reflected outside of it as
    a reference to the object is passed instead of just the value.

    :param dict[str, int] event_counter: The event counter dictionary
        containing each family_import_id as a key and its corresponding
        counter of events.
    :param str family_import_id: The family import ID to initialise an
        event counter for.
    """
    if family_import_id not in event_counter:
        event_counter[family_import_id] = 0


def process_event(
    row: pd.Series,
    gcf_events: list,
    event_counter: dict,
    approved_ref: str,
    projects_id: str,
) -> None:
    """Process a row to append events and update the event counter.

    :param pd.Series row: The row of data to process (corresponds to a
        GCF family).
    :param list gcf_events: The master list of already processed GCF
        events.
    :param dict event_counter: The event counter dictionary.
    :param str approved_ref: The FP number.
    :param str projects_id: The GCF projects ID.
    """
    family_import_id = f"GCF.event.{approved_ref}.{projects_id}"
    initialise_event_counter(event_counter, family_import_id)

    event_dates = check_event_dates(row)
    if not any(event_dates.values()):
        click.echo(f"🛑 No event dates found for {approved_ref} {projects_id}.")
        return

    for event_name, has_event in event_dates.items():
        if has_event:
            event = getattr(Events, event_name.upper())
            append_event(
                gcf_events,
                event,
                row,
                approved_ref,
                projects_id,
                event_counter[family_import_id],
            )
            event_counter[family_import_id] += 1


def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
@@ -26,7 +127,15 @@ def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
    if debug:
        click.echo("📝 Wrangling GCF event data.")

-    required_fields = set(str(e.value) for e in RequiredColumns)
+    required_fields = set(str(e.value) for e in EventColumnNames)
    verify_required_fields_present(projects_data, required_fields)

-    return []
+    gcf_events = []
+    event_counter = {}
+
+    for _, row in projects_data.iterrows():
+        approved_ref = row.at[EventColumnNames.APPROVED_REF.value]
+        projects_id = row.at[EventColumnNames.PROJECTS_ID.value]
+        process_event(row, gcf_events, event_counter, approved_ref, projects_id)
+
+    return gcf_events
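An end-to-end sketch of the new mapping flow (not part of the commit; the row values are invented sample data and it assumes the package is installed locally):

import pandas as pd

from gcf_data_mapper.parsers.event import event

# One GCF 'family' row with two of the three event dates present.
projects_data = pd.DataFrame(
    [
        {
            "ApprovedRef": "FP001",  # hypothetical FP number
            "ProjectsID": "1234",  # hypothetical projects ID
            "ApprovalDate": "2016-06-30",
            "StartDate": "2024-06-28",
            "DateCompletion": None,  # NA, so no 'completed' event is emitted
        }
    ]
)

mapped = event(projects_data, debug=False)
# Expected result: two events sharing one per-family counter, e.g.
# mapped[0]["import_id"] == "GCF.event.FP001_1234.n0000"  (Approved)
# mapped[1]["import_id"] == "GCF.event.FP001_1234.n0001"  (Under Implementation)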
44 changes: 20 additions & 24 deletions gcf_data_mapper/read.py
@@ -29,9 +29,9 @@ def read_csv_pd(
    :param Optional[int] chunk_size: The number of lines to read into
        memory in each batch iteratively. Defaults to 10**4.
-    :return Optional[pd.DataFrame]: A Pandas DataFrame containing the
-        CSV data if the file is successfully found and parsed by the
-        Pandas CSV reader. Otherwise this function will return None.
+    :return pd.DataFrame: A Pandas DataFrame containing the CSV data if
+        the file is successfully found and parsed by the Pandas CSV
+        reader. Otherwise an empty DataFrame will be returned.
    """
    # Should the path exist, read the CSV contents into memory iteratively in chunks of
    # 'chunk_size' (10**4 by default).
@@ -50,30 +50,28 @@
            return dataset

    except Exception as e:
-        print(e)
-        click.echo("Error occurred reading CSV file using Pandas: %s" % file_path)
+        click.echo(f"❌ Error reading file {file_path}: {e}")

    return pd.DataFrame([])

-def read_json_pd(file_path: str):
+def read_json_pd(file_path: str) -> pd.DataFrame:
    """Load the data from the specified JSON file into a Pandas DF.

    :param str file_path: The filepath passed by the user to the
        tool.
-    :return Optional[pd.DataFrame]: A Pandas DataFrame containing the
-        JSON data if the file is successfully found and parsed by the
-        Pandas JSON reader. Otherwise this function will return None.
+    :return pd.DataFrame: A Pandas DataFrame containing the JSON data
+        if the file is successfully found and parsed by the Pandas
+        JSON reader. Otherwise an empty DataFrame will be returned.
    """
+    df = pd.DataFrame([])
    try:
        with open(file_path, "r") as file:
            df = pd.json_normalize(json.load(file))
-        return df
    except Exception as e:
-        print(e)
-        click.echo("Error occurred reading JSON file using Pandas: %s" % file_path)
-        return pd.DataFrame([])
+        click.echo(f"❌ Error reading file {file_path}: {e}")
+    return df


def read_into_pandas(file_path: str, debug: bool = False) -> pd.DataFrame:
@@ -99,18 +97,16 @@
    if file_extension not in [e.value for e in AllowedFileExtensions]:
        raise ValueError("Error reading file: File must be a valid json or csv file")

-    if os.path.getsize(file_path) == 0 and debug:
-        click.echo(f"File '{file_path}' is empty")
-
-    try:
-        if file_extension == AllowedFileExtensions.CSV.value:
-            return read_csv_pd(file_path)
-        if file_extension == AllowedFileExtensions.JSON.value:
-            return read_json_pd(file_path)
-    except Exception as e:
-        click.echo(f"Error reading file: {e}")
-        raise e
+    df = pd.DataFrame([])
+
+    if os.path.getsize(file_path) == 0:
+        return df
+
+    if file_extension == AllowedFileExtensions.CSV.value:
+        df = read_csv_pd(file_path)
+
+    elif file_extension == AllowedFileExtensions.JSON.value:
+        df = read_json_pd(file_path)
+
+    return df
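A usage sketch for the reader API above (the file path is illustrative; any valid .json or .csv file works):

from gcf_data_mapper.read import read_into_pandas

df = read_into_pandas("data/projects.csv", debug=True)  # hypothetical path
if df.empty:
    print("Nothing was read, check the path and file contents.")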
