Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/pdct 1446 add family status to family metadata #16

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 67 additions & 5 deletions gcf_data_mapper/parsers/family.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import Any, Iterable, Optional
from typing import Any, Iterable, Optional, Union

import click
import pandas as pd

from gcf_data_mapper.enums.event import EventColumnNames, Events
from gcf_data_mapper.enums.event import EventColumnNames, Events
from gcf_data_mapper.enums.family import (
FamilyColumnsNames,
Expand Down Expand Up @@ -58,12 +60,56 @@ def calculate_status(row: pd.Series) -> Optional[str]:


def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool:
    """Report whether any entry in the given dates is NaT (Not a Time).

    Only the ``pd.NaT`` singleton counts as invalid. ``None`` and NaN entries
    are regarded as valid (empty) date entries and never trigger a match.

    :param Iterable[pd.Timestamp] list_of_dates: The dates to inspect, which
        may also include NoneTypes.
    :return bool: True if at least one entry is ``pd.NaT``, otherwise False.
    """
    for entry in list_of_dates:
        # Identity check on purpose: NaT is a singleton, and None/NaN must not match.
        if entry is pd.NaT:
            return True
    return False


def calculate_status(row: pd.Series) -> Optional[str]:
    """Calculate the status of a project from its event dates.

    The status is the latest lifecycle stage whose date has been reached:
        Completed            : NOW is past the completion date
        Under implementation : NOW is past the start date
        Approved             : NOW is past the approval date

    :param pd.Series row: The row containing the event date columns named by
        ``Events.COMPLETED``, ``Events.UNDER_IMPLEMENTATION`` and
        ``Events.APPROVED``.
    :return Optional[str]: The status of the project, or None if any date is
        NaT (invalid) or no event date has been reached yet.
    """
    # Parse with utc=True so that tz-naive values are interpreted as UTC and
    # can be compared with the tz-aware `now` below; comparing a tz-naive
    # timestamp against a tz-aware one raises TypeError. None still passes
    # through as None, and empty/NA values still become NaT.
    completed_date = pd.to_datetime(row.at[Events.COMPLETED.column_name], utc=True)
    start_date = pd.to_datetime(
        row.at[Events.UNDER_IMPLEMENTATION.column_name], utc=True
    )
    approved_date = pd.to_datetime(row.at[Events.APPROVED.column_name], utc=True)

    if contains_invalid_date_entries([completed_date, start_date, approved_date]):
        click.echo("- Row contains invalid date entries")
        return None

    now = pd.Timestamp.now(tz="UTC")

    # This block is arranged to reflect the project lifecycle in reverse order,
    # from the final stage to the initial stage, so the latest reached stage wins.
    if pd.notna(completed_date) and now >= completed_date:
        return Events.COMPLETED.type
    if pd.notna(start_date) and now >= start_date:
        return Events.UNDER_IMPLEMENTATION.type
    if pd.notna(approved_date) and now >= approved_date:
        return Events.APPROVED.type

    click.echo("- Row missing event date information to calculate status")
    return None


def get_budgets(
funding_list: list[dict], source: str
) -> Optional[list[Union[int, float]]]:
"""Get the budget amount from the row based on the funding source.

:param list[dict] row: A list of all the funding information, represented in dictionaries
:param str source: The funding source to retrieve the budget from.

:return Optional[list[int]]: A list of budget amounts corresponding to the source,
:return Optional[list[Union[int, float]]]: A list of budget amounts corresponding to the source,
or [0] if the source is not found.
"""

Expand All @@ -75,8 +121,8 @@ def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
]

# Check for any invalid values
if any(not isinstance(budget, (int)) for budget in budgets):
click.echo("🛑 Funding entries does not have valid int budget values")
if any(not isinstance(budget, (int, float)) for budget in budgets):
click.echo("- Funding entries does not have valid (int, float) budget values")
return None

# Where we have projects which have been solely funded by the fund (GCF), or solely co-financed
Expand All @@ -95,6 +141,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:

status = calculate_status(row)

if status is None:
return None

status = calculate_status(row)

if status is None:
return None

Expand Down Expand Up @@ -148,6 +199,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
"result_types": list(set(types)),
"sector": [row.at[FamilyColumnsNames.SECTOR.value]],
"status": status,
"status": status,
"theme": [row.at[FamilyColumnsNames.THEME.value]],
}

Expand All @@ -164,16 +216,18 @@ def map_family_data(
"""

family_metadata = map_family_metadata(row)
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]

# When processing the family metadata if there are any empty/falsy values we return None
# and skip the row. Therefore we don't want to process the rest of the family data so we
# return None in this conditional.
if family_metadata is None:
click.echo("🛑 Skipping row as family metadata has missing information")
click.echo(
f"🛑 Skipping row as family metadata has missing information, see ProjectsID : {projects_id}"
)
return None

approved_ref = row.at[FamilyColumnsNames.APPROVED_REF.value]
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
summary = row.at[FamilyColumnsNames.SUMMARY.value]
title = row.at[FamilyColumnsNames.TITLE.value]

Expand Down Expand Up @@ -244,6 +298,8 @@ def family(

mapped_families = []

family_columns = set(str(e.value) for e in FamilyColumnsNames)
required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))
family_columns = set(str(e.value) for e in FamilyColumnsNames)
required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))

Expand All @@ -254,6 +310,12 @@ def family(
required_fields -= set(
str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
)
# Whilst we expect the event columns to be present, some of the events in the data may have empty values.
# We therefore want to exclude these from the `row_contains_columns_with_empty_values` function,
# and handle any empty event values in the `calculate_status` function.
required_fields -= set(
str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
)

for _, row in gcf_projects_data.iterrows():
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
Expand Down
15 changes: 12 additions & 3 deletions tests/unit_tests/parsers/family/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def mock_family_doc_df():
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
]
)
Expand Down Expand Up @@ -100,6 +103,9 @@ def mock_family_row_ds():
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

Expand Down Expand Up @@ -146,6 +152,9 @@ def mock_family_row_no_result_areas():
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

Expand Down Expand Up @@ -192,7 +201,7 @@ def mock_family_row_no_entities_no_regions():


@pytest.fixture()
def mock_family_row_with_non_int_budget_values():
def mock_family_row_with_non_int_non_float_budget_values():
yield pd.Series(
{
"ProjectsID": 3,
Expand All @@ -214,8 +223,8 @@ def mock_family_row_with_non_int_budget_values():
},
{
"Source": "Co-Financing",
"Budget": 620000.20,
"BudgetUSDeq": 620000.50,
"Budget": "620000.20",
"BudgetUSDeq": "620000.50",
},
],
"ResultAreas": [
Expand Down
6 changes: 4 additions & 2 deletions tests/unit_tests/parsers/family/test_map_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,17 @@ def test_skips_processing_row_if_row_contains_empty_values(


def test_skips_processing_row_if_family_metadata_has_missing_data(
mock_family_row_no_result_areas, capsys
mock_family_row_no_result_areas: pd.Series, capsys
):
projects_id = mock_family_row_no_result_areas.ProjectsID

family_data = map_family_data(mock_family_row_no_result_areas)
assert family_data is None
captured = capsys.readouterr()
# We have two outputs, one from map_family_metadata pointing to the missing data and the second
# from map_family_data informing that the row is being skipped
map_family_data_output = captured.out.strip().split("\n")
assert (
"🛑 Skipping row as family metadata has missing information"
f"🛑 Skipping row as family metadata has missing information, see ProjectsID : {projects_id}"
== map_family_data_output[1]
)
19 changes: 14 additions & 5 deletions tests/unit_tests/parsers/family/test_map_family_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Optional

from typing import Optional

import pandas as pd
import pytest

Expand All @@ -10,6 +12,13 @@
get_budgets,
map_family_metadata,
)
from gcf_data_mapper.enums.event import Events
from gcf_data_mapper.parsers.family import (
calculate_status,
contains_invalid_date_entries,
get_budgets,
map_family_metadata,
)


@pytest.fixture()
Expand Down Expand Up @@ -129,9 +138,9 @@ def test_returns_expected_value_when_parsing_budget_data(


def test_map_family_metadata_returns_none_if_budget_does_not_contain_valid_int_types(
mock_family_row_with_non_int_budget_values: pd.Series,
mock_family_row_with_non_int_non_float_budget_values: pd.Series,
):
result = map_family_metadata(mock_family_row_with_non_int_budget_values)
result = map_family_metadata(mock_family_row_with_non_int_non_float_budget_values)
assert result is None


Expand Down Expand Up @@ -246,7 +255,7 @@ def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value):
"DateCompletion": pd.NA,
}
),
"🛑 Row contains invalid date entries",
"- Row contains invalid date entries",
),
(
pd.Series(
Expand All @@ -256,7 +265,7 @@ def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value):
"DateCompletion": "",
}
),
"🛑 Row contains invalid date entries",
"- Row contains invalid date entries",
),
(
pd.Series(
Expand All @@ -266,7 +275,7 @@ def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value):
"DateCompletion": None,
}
),
"🛑 Row missing event date information to calculate status",
"- Row missing event date information to calculate status",
),
],
)
Expand Down
Loading