feat: add function to calculate status of a project based on events #15

Merged
72 changes: 67 additions & 5 deletions gcf_data_mapper/parsers/family.py
@@ -1,8 +1,9 @@
from typing import Any, Optional
from typing import Any, Iterable, Optional

import click
import pandas as pd

from gcf_data_mapper.enums.event import EventColumnNames, Events
from gcf_data_mapper.enums.family import (
FamilyColumnsNames,
FamilyNestedColumnNames,
@@ -15,13 +16,54 @@
)


def get_budgets(funding_list: list[dict], source: str) -> list[int]:
def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool:
    """Check whether any of the values in the list of dates is NaT (Not a Time).

    :param Iterable[pd.Timestamp] list_of_dates: An iterable of pd.Timestamp values, which may also include None.
    :return bool: True if any of the values is NaT, i.e. not a valid timestamp. This distinguishes NaT from None/NaN values, which are treated as valid (absent) date entries.
    """
    return any(date is pd.NaT for date in list_of_dates)


def calculate_status(row: pd.Series) -> Optional[str]:
    """Calculate the status of a project based on its event types and dates.

    The status is determined as follows:
    Completed: NOW has passed the completion date
    Under implementation: NOW has passed the start date
    Approved: NOW has passed the approval date

    :param pd.Series row: The row containing the event information.
    :return Optional[str]: The status of the project, or None if there are no valid values.
    """
    completed_date = pd.to_datetime(row.at[Events.COMPLETED.column_name])
    start_date = pd.to_datetime(row.at[Events.UNDER_IMPLEMENTATION.column_name])
    approved_date = pd.to_datetime(row.at[Events.APPROVED.column_name])

    if contains_invalid_date_entries([completed_date, start_date, approved_date]):
        click.echo("🛑 Row contains invalid date entries")
        return None

    now = pd.Timestamp.now(tz="UTC")

    # This block is arranged to reflect the project lifecycle in reverse order,
    # from the final stage back to the initial stage.
    if pd.notna(completed_date) and now >= completed_date:
        return Events.COMPLETED.type
    if pd.notna(start_date) and now >= start_date:
        return Events.UNDER_IMPLEMENTATION.type
    if pd.notna(approved_date) and now >= approved_date:
        return Events.APPROVED.type

    click.echo("🛑 Row missing event date information to calculate status")
    return None


def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
"""Get the budget amount from the row based on the funding source.

:param list[dict] funding_list: A list of dictionaries containing all of the funding information.
:param str source: The funding source to retrieve the budget from.

:return list[int]: A list of budget amounts corresponding to the source,
:return Optional[list[int]]: A list of budget amounts corresponding to the source,
or [0] if the source is not found.
"""

@@ -32,6 +74,11 @@ def get_budgets(funding_list: list[dict], source: str) -> list[int]:
funding[budget_key] for funding in funding_list if funding[source_key] == source
]

# Check for any invalid values
if any(not isinstance(budget, int) for budget in budgets):
click.echo("🛑 Funding entries do not have valid int budget values")
return None

# Where we have projects which have been solely funded by the fund (GCF), or solely co-financed
# - so in instances where there will be no funding that match either the GCF or co-financing
# source value, we will map the `project_value_fund spend` or the `project_value_co_financing`
@@ -46,6 +93,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
:return Optional[dict]: A dictionary containing mapped metadata for the family.
"""

status = calculate_status(row)

if status is None:
return None

countries = row.at[FamilyColumnsNames.COUNTRIES.value]
entities = row.at[FamilyColumnsNames.ENTITIES.value]
funding_sources = row.at[FamilyColumnsNames.FUNDING.value]
@@ -61,6 +113,9 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
)
gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value)

if gcf_budgets is None or co_financing_budgets is None:
return None

implementing_agencies = [entity[name_key] for entity in entities]
regions = [country[region_key] for country in countries]
areas = [result[area_key] for result in result_areas]
@@ -92,6 +147,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
"result_areas": list(set(areas)),
"result_types": list(set(types)),
"sector": [row.at[FamilyColumnsNames.SECTOR.value]],
"status": status,
"theme": [row.at[FamilyColumnsNames.THEME.value]],
}

@@ -188,10 +244,16 @@ def family(

mapped_families = []

required_fields = set(str(e.value) for e in FamilyColumnsNames)
family_columns = set(str(e.value) for e in FamilyColumnsNames)
required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))

verify_required_fields_present(gcf_projects_data, required_fields)
# Check that the projects data has the fields we need.
# Whilst we expect the event columns to be present, some of the events in the data may have empty values.
# We therefore want to exclude these from the `row_contains_columns_with_empty_values` function,
# and handle any empty event values in the `calculate_status` function.
required_fields -= set(
str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
)

for _, row in gcf_projects_data.iterrows():
projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
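For context, a minimal, self-contained sketch of how the new status calculation behaves (not part of this diff). It uses the raw column names from the test fixtures ("ApprovalDate", "StartDate", "DateCompletion") and literal status strings in place of the Events enum, whose exact values live in gcf_data_mapper.enums.event and are assumed here.

from typing import Iterable, Optional

import pandas as pd


def sketch_contains_invalid_date_entries(dates: Iterable[pd.Timestamp]) -> bool:
    # NaT marks a date that failed to parse; None/NaN mean "no date recorded".
    return any(date is pd.NaT for date in dates)


def sketch_calculate_status(row: pd.Series) -> Optional[str]:
    completed_date = pd.to_datetime(row.at["DateCompletion"])
    start_date = pd.to_datetime(row.at["StartDate"])
    approved_date = pd.to_datetime(row.at["ApprovalDate"])

    if sketch_contains_invalid_date_entries([completed_date, start_date, approved_date]):
        return None

    now = pd.Timestamp.now(tz="UTC")

    # Reverse lifecycle order: the latest stage the project has reached wins.
    if pd.notna(completed_date) and now >= completed_date:
        return "Completed"
    if pd.notna(start_date) and now >= start_date:
        return "Under Implementation"
    if pd.notna(approved_date) and now >= approved_date:
        return "Approved"
    return None


example_row = pd.Series(
    {
        "ApprovalDate": "2016-06-30T00:00:00.000Z",
        "StartDate": "2024-06-28T00:00:00.000Z",
        "DateCompletion": None,
    }
)
# With no completion date and a start date in the past, this mirrors the
# "Under Implementation" status expected by the tests below.
print(sketch_calculate_status(example_row))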
52 changes: 52 additions & 0 deletions tests/unit_tests/parsers/family/conftest.py
@@ -47,6 +47,9 @@ def mock_family_doc_df():
"Type": "Adaptation",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
]
)
@@ -94,6 +97,9 @@ def mock_family_row_ds():
"Type": "The Type for the Result Area",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

@@ -137,6 +143,9 @@ def mock_family_row_no_result_areas():
"ResultAreas": [
{"Area": "", "Type": ""},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

@@ -175,6 +184,49 @@ def mock_family_row_no_entities_no_regions():
"Type": "The Type for the Result Area",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)


@pytest.fixture()
def mock_family_row_with_non_int_budget_values():
yield pd.Series(
{
"ProjectsID": 3,
"ApprovedRef": "FP004",
"ProjectName": "Enhancing resilience of marine ecosystems",
"Theme": "Adaptation",
"Sector": "Private",
"ProjectURL": "https://www.climateaction.fund/project/FP004",
"Summary": "The Summary of the Project",
"Countries": [
{"Region": ""},
],
"Entities": [{"Name": ""}],
"Funding": [
{
"Source": "GCF",
"Budget": "82000",
"BudgetUSDeq": "82000",
},
{
"Source": "Co-Financing",
"Budget": 620000.20,
"BudgetUSDeq": 620000.50,
},
],
"ResultAreas": [
{
"Area": "The Area for the Result Area",
"Type": "The Type for the Result Area",
},
],
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
)

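As a usage note (not part of this diff), the new mock_family_row_with_non_int_budget_values fixture would most naturally back a test along these lines. The test name and the literal "GCF" source string are assumptions, with get_budgets imported from gcf_data_mapper.parsers.family, the module changed above.

from gcf_data_mapper.parsers.family import get_budgets


def test_get_budgets_returns_none_when_budget_values_are_not_ints(
    mock_family_row_with_non_int_budget_values,
):
    funding_list = mock_family_row_with_non_int_budget_values.at["Funding"]
    # The GCF entry carries its budget as a string ("82000"), so the new
    # int check in get_budgets should reject it and return None.
    assert get_budgets(funding_list, "GCF") is None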
6 changes: 5 additions & 1 deletion tests/unit_tests/parsers/family/test_map_family.py
@@ -24,6 +24,7 @@ def parsed_family_data():
"result_areas": ["Coastal protection and restoration"],
"result_types": ["Adaptation"],
"sector": ["Environment"],
"status": "Under Implementation",
"theme": ["Adaptation"],
},
"title": "Enhancing resilience of coastal ecosystems and communities",
@@ -63,11 +64,14 @@ def test_raises_error_on_validating_row_for_missing_columns():
"ResultAreas": [{"Area": "Coastal"}],
"Summary": "Fake Summary",
"ProjectName": "Fake Project Name",
"ApprovalDate": "2016-06-30T00:00:00.000Z",
"StartDate": "2024-06-28T00:00:00.000Z",
"DateCompletion": None,
}
]
)

expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovedRef', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'Summary']"
expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovalDate', 'ApprovedRef', 'DateCompletion', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'StartDate', 'Summary']"
with pytest.raises(AttributeError) as e:
family(test_data_frame, debug=True)
assert expected_error_message == str(e.value)
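Finally, a small sketch of the required-fields bookkeeping that this updated error message reflects (not part of this diff): the family() parser now verifies the event columns up front but drops them again before the per-row empty-value check. The column sets below are stand-ins inferred from the fixtures; the real values come from the FamilyColumnsNames and EventColumnNames enums.

FAMILY_COLUMNS = {
    "ProjectsID", "ApprovedRef", "ProjectName", "ProjectURL", "Summary",
    "Countries", "Entities", "Funding", "ResultAreas", "Sector", "Theme",
}
EVENT_COLUMNS = {"ApprovalDate", "StartDate", "DateCompletion"}

# Presence of both sets is verified first, which is why the missing-columns
# error above now also lists ApprovalDate, StartDate and DateCompletion.
required_fields = FAMILY_COLUMNS | EVENT_COLUMNS

# The event columns are then excluded again, so empty event values are handled
# by calculate_status rather than by row_contains_columns_with_empty_values.
required_fields -= {col for col in EVENT_COLUMNS if col not in FAMILY_COLUMNS}

assert required_fields == FAMILY_COLUMNS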