Skip to content

Commit

Permalink
feat: add function to calculate status of a project based on events (#15)
Browse files (browse the repository at this point in the history)

* feat: add function to calculate status of a project based on events

* refactor: move nonetype budget check further up the chain in the block

* refactor: update budget type checker to expect ints only

---------

Co-authored-by: Osneil Drakes <[email protected]>
Co-authored-by: Osneil Drakes <[email protected]>
  • Loading branch information
3 people authored Sep 11, 2024
1 parent 668f12e commit b3348a2
Show file tree
Hide file tree
Showing 4 changed files with 285 additions and 7 deletions.
72 changes: 67 additions & 5 deletions gcf_data_mapper/parsers/family.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import Any, Optional
from typing import Any, Iterable, Optional

import click
import pandas as pd

from gcf_data_mapper.enums.event import EventColumnNames, Events
from gcf_data_mapper.enums.family import (
FamilyColumnsNames,
FamilyNestedColumnNames,
Expand All @@ -15,13 +16,54 @@
)


def get_budgets(funding_list: list[dict], source: str) -> list[int]:
def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool:
    """Report whether the given dates include pandas' NaT (Not a Time) marker.

    Only entries that are the ``pd.NaT`` object itself are flagged; ``None``
    entries and real timestamps are left alone, so a missing date is not
    treated as an invalid one.

    :param Iterable[pd.Timestamp] list_of_dates: The dates to inspect, which
        may also include NoneTypes
    :return bool: True as soon as one entry is ``pd.NaT``, otherwise False
    """
    for candidate in list_of_dates:
        # Identity check on purpose: None must not be mistaken for NaT.
        if candidate is pd.NaT:
            return True
    return False


def calculate_status(row: pd.Series) -> Optional[str]:
    """Calculate the status of a project based on its event dates.

    The latest lifecycle stage whose date has already been reached wins:
        Completed            : NOW is past the completion date
        Under implementation : NOW is past the start date
        Approved             : NOW is past the approval date

    :param pd.Series row: The row containing the event information
    :return Optional[str]: The status of the project, or None if the row
        contains an invalid (NaT) date or no event date has been reached yet
    """
    # Arranged to reflect the project lifecycle in reverse order, from the
    # final stage to the initial stage, so the most advanced stage is matched first.
    lifecycle = (Events.COMPLETED, Events.UNDER_IMPLEMENTATION, Events.APPROVED)

    # utc=True makes every parsed date timezone-aware (naive inputs are taken
    # as UTC), so the comparison with the timezone-aware `now` below cannot
    # raise "Cannot compare tz-naive and tz-aware timestamps".
    event_dates = [
        pd.to_datetime(row.at[event.column_name], utc=True) for event in lifecycle
    ]

    if contains_invalid_date_entries(event_dates):
        click.echo("🛑 Row contains invalid date entries")
        return None

    now = pd.Timestamp.now(tz="UTC")

    for event, event_date in zip(lifecycle, event_dates):
        # Empty (None) dates are allowed; they simply cannot match a stage.
        if pd.notna(event_date) and now >= event_date:
            return event.type

    click.echo("🛑 Row missing event date information to calculate status")
    return None


def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
"""Get the budget amount from the row based on the funding source.
:param list[dict] row: A list of all the funding information, represented in dictionaries
:param str source: The funding source to retrieve the budget from.
:return list[int]: A list of budget amounts corresponding to the source,
:return Optional[list[int]]: A list of budget amounts corresponding to the source,
or [0] if the source is not found.
"""

Expand All @@ -32,6 +74,11 @@ def get_budgets(funding_list: list[dict], source: str) -> list[int]:
funding[budget_key] for funding in funding_list if funding[source_key] == source
]

# Check for any invalid values
if any(not isinstance(budget, (int)) for budget in budgets):
click.echo("🛑 Funding entries does not have valid int budget values")
return None

# Where we have projects which have been solely funded by the fund (GCF), or solely co-financed
# - so in instances where there will be no funding that match either the GCF or co-financing
# source value, we will map the `project_value_fund spend` or the `project_value_co_financing`
Expand All @@ -46,6 +93,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
:return Optional[dict]: A dictionary containing mapped metadata for the family.
"""

status = calculate_status(row)

if status is None:
return None

countries = row.at[FamilyColumnsNames.COUNTRIES.value]
entities = row.at[FamilyColumnsNames.ENTITIES.value]
funding_sources = row.at[FamilyColumnsNames.FUNDING.value]
Expand All @@ -61,6 +113,9 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
)
gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value)

if gcf_budgets is None or co_financing_budgets is None:
return None

implementing_agencies = [entity[name_key] for entity in entities]
regions = [country[region_key] for country in countries]
areas = [result[area_key] for result in result_areas]
Expand Down Expand Up @@ -92,6 +147,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
"result_areas": list(set(areas)),
"result_types": list(set(types)),
"sector": [row.at[FamilyColumnsNames.SECTOR.value]],
"status": status,
"theme": [row.at[FamilyColumnsNames.THEME.value]],
}

Expand Down Expand Up @@ -188,10 +244,16 @@ def family(

mapped_families = []

required_fields = set(str(e.value) for e in FamilyColumnsNames)
family_columns = set(str(e.value) for e in FamilyColumnsNames)
required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))

verify_required_fields_present(gcf_projects_data, required_fields)
# Do a check that the projects data has the field you need
# Whilst we expect the event columns to be present, some of the events in the data may have empty values.
# We therefore want to exclude these from the `row_contains_columns_with_empty_values` function,
# and handle any empty event values in the `calculate_status` function.
required_fields -= set(
str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
)

for _, row in gcf_projects_data.iterrows():
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
Expand Down
52 changes: 52 additions & 0 deletions tests/unit_tests/parsers/family/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ def mock_family_doc_df():
"Type": "Adaptation",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
]
)
Expand Down Expand Up @@ -94,6 +97,9 @@ def mock_family_row_ds():
"Type": "The Type for the Result Area",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

Expand Down Expand Up @@ -137,6 +143,9 @@ def mock_family_row_no_result_areas():
"ResultAreas": [
{"Area": "", "Type": ""},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

Expand Down Expand Up @@ -175,6 +184,49 @@ def mock_family_row_no_entities_no_regions():
"Type": "The Type for the Result Area",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)


@pytest.fixture()
def mock_family_row_with_non_int_budget_values():
    """Yield a family row whose funding budgets are not ints.

    The GCF entry carries its budget as a string and the Co-Financing entry
    carries floats; every other field is populated like a regular row.
    """
    gcf_funding = {
        "Source": "GCF",
        "Budget": "82000",
        "BudgetUSDeq": "82000",
    }
    co_financing_funding = {
        "Source": "Co-Financing",
        "Budget": 620000.20,
        "BudgetUSDeq": 620000.50,
    }
    result_area = {
        "Area": "The Area for the Result Area",
        "Type": "The Type for the Result Area",
    }

    row_values = {
        "ProjectsID": 3,
        "ApprovedRef": "FP004",
        "ProjectName": "Enhancing resilience of marine ecosystems",
        "Theme": "Adaptation",
        "Sector": "Private",
        "ProjectURL": "https://www.climateaction.fund/project/FP004",
        "Summary": "The Summary of the Project",
        "Countries": [{"Region": ""}],
        "Entities": [{"Name": ""}],
        "Funding": [gcf_funding, co_financing_funding],
        "ResultAreas": [result_area],
        "ApprovalDate": "2016-06-30T00:00:00.000Z",
        "StartDate": "2024-06-28T00:00:00.000Z",
        "DateCompletion": None,
    }
    yield pd.Series(row_values)

Expand Down
6 changes: 5 additions & 1 deletion tests/unit_tests/parsers/family/test_map_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def parsed_family_data():
"result_areas": ["Coastal protection and restoration"],
"result_types": ["Adaptation"],
"sector": ["Environment"],
"status": "Under Implementation",
"theme": ["Adaptation"],
},
"title": "Enhancing resilience of coastal ecosystems and communities",
Expand Down Expand Up @@ -63,11 +64,14 @@ def test_raises_error_on_validating_row_for_missing_columns():
"ResultAreas": [{"Area": "Coastal"}],
"Summary": "Fake Summary",
"ProjectName": "Fake Project Name",
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
]
)

expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovedRef', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'Summary']"
expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovalDate', 'ApprovedRef', 'DateCompletion', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'StartDate', 'Summary']"
with pytest.raises(AttributeError) as e:
family(test_data_frame, debug=True)
assert expected_error_message == str(e.value)
Expand Down
Loading

0 comments on commit b3348a2

Please sign in to comment.