Feature/pdct 1418 Map GCF event data to new json structure (#11)
* Add raise in docstring for verify_required_fields_present

* Add make command to run tests under coverage

* Update docstrings & return types

* Add make command for generating coverage html report

* Fix comment

* Declare empty df to return at end of function

* Remove unused code

* Remove unused code

* Remove old assertion

* Add output file to git ignore

* Dump JSON to file with error handling

* Update cspell.json

* WIP for generating the mapping for GCF events data

* DRY: Break event function into smaller functions

* Bump to 0.1.8

* Move event enums into separate file

* Use verbose pytest

* Create tests for initialise_event_counter

* Update cspell.json

* Create test_append_event.py

* Create test_check_event_dates.py

* Create test_process_event.py

* Fix test_event tests
katybaulch authored Sep 4, 2024
1 parent bab7919 commit cf13a5e
Showing 18 changed files with 528 additions and 123 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -174,3 +174,6 @@ plugins
user_trunk.yaml
user.yaml
tmp

# Output files
output.json
3 changes: 2 additions & 1 deletion .trunk/configs/cspell.json
@@ -35,7 +35,8 @@
"iloc",
"iterrows",
"notna",
"conftest"
"conftest",
"capsys"
],
"flagWords": ["hte"],
"suggestionsTimeout": 5000
9 changes: 8 additions & 1 deletion Makefile
@@ -41,4 +41,11 @@ build: ## Build the project
	poetry build

test: ## Run tests using pytest
-	poetry run pytest -v
+	poetry run pytest -vvv
+
+test_coverage: ## Run tests using pytest with coverage
+	poetry run coverage run -m pytest -vvv tests
+	coverage report
+
+test_coverage_html: test_coverage ## Run tests using pytest with coverage and generate an HTML report
+	coverage html
8 changes: 8 additions & 0 deletions gcf_data_mapper/cli.py
@@ -1,3 +1,4 @@
import json
import os
import sys
from typing import Any, Optional
@@ -111,6 +112,13 @@ def dump_output(
    if debug:
        click.echo(f"📝 Output file {click.format_filename(output_file)}")

    try:
        with open(output_file, "w+", encoding="utf-8") as f:
            json.dump(mapped_data, f, ensure_ascii=False, indent=2)
    except Exception as e:
        click.echo(f"❌ Failed to dump JSON to file. Error: {e}.")
        sys.exit(1)


if __name__ == "__main__":
    entrypoint()
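As an aside (an illustrative sketch, not part of the diff): `ensure_ascii=False` is what keeps non-ASCII characters, such as accented project or country names, human-readable in the dumped file, which is why the file is opened with `encoding="utf-8"`:

import json

data = {"country": "Côte d'Ivoire"}

# With the default ensure_ascii=True, non-ASCII characters are escaped.
print(json.dumps(data))                      # {"country": "C\u00f4te d'Ivoire"}
print(json.dumps(data, ensure_ascii=False))  # {"country": "Côte d'Ivoire"}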
40 changes: 40 additions & 0 deletions gcf_data_mapper/enums/event.py
@@ -0,0 +1,40 @@
from collections import namedtuple
from enum import Enum

Event = namedtuple("event", ["name", "type", "column_name"])


class EventColumnNames(Enum):
    """The fields the GCF data mapper needs to parse event data."""

    APPROVED = "ApprovalDate"
    UNDER_IMPLEMENTATION = "StartDate"
    COMPLETED = "DateCompletion"
    APPROVED_REF = "ApprovedRef"
    PROJECTS_ID = "ProjectsID"


class EventTypeNames(Enum):
    """The GCF event type names (should map to the GCF taxonomy)."""

    APPROVED = "Approved"
    UNDER_IMPLEMENTATION = "Under Implementation"
    COMPLETED = "Completed"


class Events:
    APPROVED = Event(
        "approved",
        EventTypeNames.APPROVED.value,
        EventColumnNames.APPROVED.value,
    )
    UNDER_IMPLEMENTATION = Event(
        "under_implementation",
        EventTypeNames.UNDER_IMPLEMENTATION.value,
        EventColumnNames.UNDER_IMPLEMENTATION.value,
    )
    COMPLETED = Event(
        "completed",
        EventTypeNames.COMPLETED.value,
        EventColumnNames.COMPLETED.value,
    )
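For orientation, each attribute of `Events` is an `Event` namedtuple, so parser code can look an event up by attribute and read its fields by name. A minimal sketch (not part of the commit; assumes the package is importable):

from gcf_data_mapper.enums.event import Events

approved = Events.APPROVED
print(approved.name)         # approved
print(approved.type)         # Approved
print(approved.column_name)  # ApprovalDate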
123 changes: 116 additions & 7 deletions gcf_data_mapper/parsers/event.py
@@ -1,16 +1,117 @@
-from enum import Enum
from typing import Any, Optional

import click
import pandas as pd

+from gcf_data_mapper.enums.event import Event, EventColumnNames, Events
from gcf_data_mapper.parsers.helpers import verify_required_fields_present


-class RequiredColumns(Enum):
-    APPROVED = "ApprovalDate"
-    UNDER_IMPLEMENTATION = "StartDate"
-    COMPLETED = "DateCompletion"
def append_event(
    gcf_events: list,
    event: Event,
    row: pd.Series,
    approved_ref: str,
    projects_id: str,
    n_value: int,
) -> None:
    """Append an event to the master list that is passed in.

    Remember, because lists are mutable in Python, any changes to the
    list inside a function will be reflected outside of it as a
    reference to the object is passed instead of just the value.

    :param list gcf_events: The list of GCF events.
    :param Event event: The event to append.
    :param pd.Series row: The row of data containing GCF event info.
        Each row corresponds to a GCF 'family'.
    :param str approved_ref: The FP number.
    :param str projects_id: The GCF projects ID.
    :param int n_value: The event number for the given GCF family.
    """
    gcf_events.append(
        {
            "import_id": f"GCF.event.{approved_ref}_{projects_id}.n{n_value:04}",
            "family_import_id": f"GCF.event.{approved_ref}.{projects_id}",
            "event_title": event.type,
            "date": row[event.column_name],
            "event_type_value": event.type,
        }
    )


def check_event_dates(row: pd.Series) -> dict[str, bool]:
    """Check if the row contains valid event date values (not NA).

    :param pd.Series row: The row of data to check.
    :return dict[str, bool]: A dict indicating the presence of each
        event date.
    """
    return {
        Events.APPROVED.name: pd.notna(row.at[Events.APPROVED.column_name]),
        Events.UNDER_IMPLEMENTATION.name: pd.notna(
            row.at[Events.UNDER_IMPLEMENTATION.column_name]
        ),
        Events.COMPLETED.name: pd.notna(row.at[Events.COMPLETED.column_name]),
    }


def initialise_event_counter(
    event_counter: dict[str, int], family_import_id: str
) -> None:
    """Initialise the event counter for a family_import_id if not present.

    Remember, because dictionaries are mutable in Python, any changes to
    the dictionary inside a function will be reflected outside of it as
    a reference to the object is passed instead of just the value.

    :param dict[str, int] event_counter: The event counter dictionary
        containing each family_import_id as a key and its corresponding
        counter of events.
    :param str family_import_id: The family import ID to initialise an
        event counter for.
    """
    if family_import_id not in event_counter:
        event_counter[family_import_id] = 0


def process_event(
    row: pd.Series,
    gcf_events: list,
    event_counter: dict,
    approved_ref: str,
    projects_id: str,
) -> None:
    """Process a row to append events and update the event counter.

    :param pd.Series row: The row of data to process (corresponds to a
        GCF family).
    :param list gcf_events: The master list of already processed GCF
        events.
    :param dict event_counter: The event counter dictionary.
    :param str approved_ref: The FP number.
    :param str projects_id: The GCF projects ID.
    """
    family_import_id = f"GCF.event.{approved_ref}.{projects_id}"
    initialise_event_counter(event_counter, family_import_id)

    event_dates = check_event_dates(row)
    if not any(event_dates.values()):
        click.echo(f"🛑 No event dates found for {approved_ref} {projects_id}.")
        return

    for event_name, has_event in event_dates.items():
        if has_event:
            event = getattr(Events, event_name.upper())
            append_event(
                gcf_events,
                event,
                row,
                approved_ref,
                projects_id,
                event_counter[family_import_id],
            )
            event_counter[family_import_id] += 1


def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
@@ -26,7 +127,15 @@ def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
    if debug:
        click.echo("📝 Wrangling GCF event data.")

-    required_fields = set(str(e.value) for e in RequiredColumns)
+    required_fields = set(str(e.value) for e in EventColumnNames)
    verify_required_fields_present(projects_data, required_fields)

-    return []
+    gcf_events = []
+    event_counter = {}
+
+    for _, row in projects_data.iterrows():
+        approved_ref = row.at[EventColumnNames.APPROVED_REF.value]
+        projects_id = row.at[EventColumnNames.PROJECTS_ID.value]
+        process_event(row, gcf_events, event_counter, approved_ref, projects_id)
+
+    return gcf_events
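An end-to-end sketch of the new mapping flow (not part of the commit; the row values are invented sample data and it assumes the package is installed locally):

import pandas as pd

from gcf_data_mapper.parsers.event import event

# One GCF 'family' row with two of the three event dates present.
projects_data = pd.DataFrame(
    [
        {
            "ApprovedRef": "FP001",  # hypothetical FP number
            "ProjectsID": "1234",  # hypothetical projects ID
            "ApprovalDate": "2016-06-30",
            "StartDate": "2024-06-28",
            "DateCompletion": None,  # NA, so no 'completed' event is emitted
        }
    ]
)

mapped = event(projects_data, debug=False)
# Expected result: two events sharing one per-family counter, e.g.
# mapped[0]["import_id"] == "GCF.event.FP001_1234.n0000"  (Approved)
# mapped[1]["import_id"] == "GCF.event.FP001_1234.n0001"  (Under Implementation)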
44 changes: 20 additions & 24 deletions gcf_data_mapper/read.py
@@ -29,9 +29,9 @@ def read_csv_pd(
    :param Optional[int] chunk_size: The number of lines to read into
        memory in each batch iteratively. Defaults to 10**4.
-    :return Optional[pd.DataFrame]: A Pandas DataFrame containing the
-        CSV data if the file is successfully found and parsed by the
-        Pandas CSV reader. Otherwise this function will return None.
+    :return pd.DataFrame: A Pandas DataFrame containing the CSV data if
+        the file is successfully found and parsed by the Pandas CSV
+        reader. Otherwise an empty DataFrame will be returned.
    """
    # Should the path exist, read the CSV contents into memory iteratively in chunks of
    # 'chunk_size' (10**4 by default).
@@ -50,30 +50,28 @@
            return dataset

    except Exception as e:
-        print(e)
-        click.echo("Error occurred reading CSV file using Pandas: %s" % file_path)
+        click.echo(f"❌ Error reading file {file_path}: {e}")

    return pd.DataFrame([])

-def read_json_pd(file_path: str):
+def read_json_pd(file_path: str) -> pd.DataFrame:
    """Load the data from the specified JSON file into a Pandas DF.

    :param str file_path: The filepath passed by the user to the
        tool.
-    :return Optional[pd.DataFrame]: A Pandas DataFrame containing the
-        JSON data if the file is successfully found and parsed by the
-        Pandas JSON reader. Otherwise this function will return None.
+    :return pd.DataFrame: A Pandas DataFrame containing the JSON data
+        if the file is successfully found and parsed by the Pandas
+        JSON reader. Otherwise an empty DataFrame will be returned.
    """
+    df = pd.DataFrame([])
    try:
        with open(file_path, "r") as file:
            df = pd.json_normalize(json.load(file))
-        return df
    except Exception as e:
-        print(e)
-        click.echo("Error occurred reading JSON file using Pandas: %s" % file_path)
-        return pd.DataFrame([])
+        click.echo(f"❌ Error reading file {file_path}: {e}")
+    return df


def read_into_pandas(file_path: str, debug: bool = False) -> pd.DataFrame:
@@ -99,18 +97,16 @@
    if file_extension not in [e.value for e in AllowedFileExtensions]:
        raise ValueError("Error reading file: File must be a valid json or csv file")

-    if os.path.getsize(file_path) == 0 and debug:
-        click.echo(f"File '{file_path}' is empty")
-
-    try:
-        if file_extension == AllowedFileExtensions.CSV.value:
-            return read_csv_pd(file_path)
-        if file_extension == AllowedFileExtensions.JSON.value:
-            return read_json_pd(file_path)
-    except Exception as e:
-        click.echo(f"Error reading file: {e}")
-        raise e
+    df = pd.DataFrame([])
+
+    if os.path.getsize(file_path) == 0:
+        return df
+
+    if file_extension == AllowedFileExtensions.CSV.value:
+        df = read_csv_pd(file_path)
+
+    elif file_extension == AllowedFileExtensions.JSON.value:
+        df = read_json_pd(file_path)
+
+    return df
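A usage sketch for the reader API above (the file path is illustrative; any valid .json or .csv file works):

from gcf_data_mapper.read import read_into_pandas

df = read_into_pandas("data/projects.csv", debug=True)  # hypothetical path
if df.empty:
    print("Nothing was read, check the path and file contents.")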
