feat: add function to calculate status of a project based on events (#15)

* feat: add function to calculate status of a project based on events
* refactor: move nonetype budget check further up the chain in the block
* refactor: update budget type checker to expect ints only

---------

Co-authored-by: Osneil Drakes <[email protected]>
Co-authored-by: Osneil Drakes <[email protected]>
climatepolicyradar · Sep 11, 2024 · b3348a2 · b3348a2
1 parent 668f12e
commit b3348a2
Show file tree

Hide file tree

Showing 4 changed files with 285 additions and 7 deletions.
diff --git a/gcf_data_mapper/parsers/family.py b/gcf_data_mapper/parsers/family.py
@@ -1,8 +1,9 @@
-from typing import Any, Optional
+from typing import Any, Iterable, Optional
 
 import click
 import pandas as pd
 
+from gcf_data_mapper.enums.event import EventColumnNames, Events
 from gcf_data_mapper.enums.family import (
     FamilyColumnsNames,
     FamilyNestedColumnNames,
@@ -15,13 +16,54 @@
 )
 
 
-def get_budgets(funding_list: list[dict], source: str) -> list[int]:
+def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool:
+    """Check whether any value in the given dates is NaT (Not a Time).
+
+    :param Iterable[pd.Timestamp] list_of_dates: An iterable of pd.Timestamp values; may also contain None
+    :return bool: True if at least one value is the pd.NaT singleton (identity check), otherwise False. None (and raw NaN) entries are not flagged, so callers can treat a missing date differently from a NaT one.
+    """
+    return any(date is pd.NaT for date in list_of_dates)
+
+
+def calculate_status(row: pd.Series) -> Optional[str]:
+    """Calculate the status of a project from its event dates.
+        The most advanced lifecycle stage whose date is not in the future wins:
+            Completed : (NOW is at or past the completion date)
+            Under implementation : (NOW is at or past the start date)
+            Approved : (NOW is at or past the approval date)
+
+    :param pd.Series row: The row containing the event date columns
+    :return Optional[str]: The event type of the matching stage; None (after echoing a message) if any date is NaT or no stage matches
+    """
+    completed_date = pd.to_datetime(row.at[Events.COMPLETED.column_name])
+    start_date = pd.to_datetime(row.at[Events.UNDER_IMPLEMENTATION.column_name])
+    approved_date = pd.to_datetime(row.at[Events.APPROVED.column_name])
+
+    if contains_invalid_date_entries([completed_date, start_date, approved_date]):
+        click.echo("🛑 Row contains invalid date entries")
+        return None
+
+    now = pd.Timestamp.now(tz="UTC")  # tz-aware; NOTE(review): assumes the parsed event dates are tz-aware too — confirm
+
+    # Walk the lifecycle in reverse (final stage first) so the most advanced stage reached is returned; pd.notna skips missing dates.
+    if pd.notna(completed_date) and now >= completed_date:
+        return Events.COMPLETED.type
+    if pd.notna(start_date) and now >= start_date:
+        return Events.UNDER_IMPLEMENTATION.type
+    if pd.notna(approved_date) and now >= approved_date:
+        return Events.APPROVED.type
+
+    click.echo("🛑 Row missing event date information to calculate status")
+    return None
+
+
+def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
     """Get the budget amount from the row based on the funding source.
 
     :param list[dict] row: A list of all the funding information, represented in dictionaries
     :param str source: The funding source to retrieve the budget from.
 
-    :return list[int]: A list of budget amounts corresponding to the source,
+    :return Optional[list[int]]: A list of budget amounts corresponding to the source,
         or [0] if the source is not found.
     """
 
@@ -32,6 +74,11 @@ def get_budgets(funding_list: list[dict], source: str) -> list[int]:
         funding[budget_key] for funding in funding_list if funding[source_key] == source
     ]
 
+    # Check for any invalid values
+    if any(not isinstance(budget, (int)) for budget in budgets):
+        click.echo("🛑 Funding entries does not have valid int budget values")
+        return None
+
     # Where we have projects which have been solely funded by the fund (GCF), or solely co-financed
     # - so in instances where there will be no funding that match either the GCF or co-financing
     # source value, we will map the `project_value_fund spend` or the `project_value_co_financing`
@@ -46,6 +93,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
     :return Optional[dict]: A dictionary containing mapped metadata for the family.
     """
 
+    status = calculate_status(row)
+
+    if status is None:
+        return None
+
     countries = row.at[FamilyColumnsNames.COUNTRIES.value]
     entities = row.at[FamilyColumnsNames.ENTITIES.value]
     funding_sources = row.at[FamilyColumnsNames.FUNDING.value]
@@ -61,6 +113,9 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
     )
     gcf_budgets = get_budgets(funding_sources, GCFProjectBudgetSource.GCF.value)
 
+    if gcf_budgets is None or co_financing_budgets is None:
+        return None
+
     implementing_agencies = [entity[name_key] for entity in entities]
     regions = [country[region_key] for country in countries]
     areas = [result[area_key] for result in result_areas]
@@ -92,6 +147,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
         "result_areas": list(set(areas)),
         "result_types": list(set(types)),
         "sector": [row.at[FamilyColumnsNames.SECTOR.value]],
+        "status": status,
         "theme": [row.at[FamilyColumnsNames.THEME.value]],
     }
 
@@ -188,10 +244,16 @@ def family(
 
     mapped_families = []
 
-    required_fields = set(str(e.value) for e in FamilyColumnsNames)
+    family_columns = set(str(e.value) for e in FamilyColumnsNames)
+    required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))
 
     verify_required_fields_present(gcf_projects_data, required_fields)
-    # Do a check that the projects data has the field you need
+    # Whilst we expect the event columns to be present, some of the events in the data may have empty values.
+    # We therefore want to exclude these from the `row_contains_columns_with_empty_values` function,
+    # and handle any empty event values in the `calculate_status` function.
+    required_fields -= set(
+        str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
+    )
 
     for _, row in gcf_projects_data.iterrows():
         projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]

diff --git a/tests/unit_tests/parsers/family/conftest.py b/tests/unit_tests/parsers/family/conftest.py
@@ -47,6 +47,9 @@ def mock_family_doc_df():
                         "Type": "Adaptation",
                     },
                 ],
+                "ApprovalDate": "2016-06-30T00:00:00.000Z",
+                "StartDate": "2024-06-28T00:00:00.000Z",
+                "DateCompletion": None,
             }
         ]
     )
@@ -94,6 +97,9 @@ def mock_family_row_ds():
                     "Type": "The Type for the Result Area",
                 },
             ],
+            "ApprovalDate": "2016-06-30T00:00:00.000Z",
+            "StartDate": "2024-06-28T00:00:00.000Z",
+            "DateCompletion": None,
         }
     )
 
@@ -137,6 +143,9 @@ def mock_family_row_no_result_areas():
             "ResultAreas": [
                 {"Area": "", "Type": ""},
             ],
+            "ApprovalDate": "2016-06-30T00:00:00.000Z",
+            "StartDate": "2024-06-28T00:00:00.000Z",
+            "DateCompletion": None,
         }
     )
 
@@ -175,6 +184,49 @@ def mock_family_row_no_entities_no_regions():
                     "Type": "The Type for the Result Area",
                 },
             ],
+            "ApprovalDate": "2016-06-30T00:00:00.000Z",
+            "StartDate": "2024-06-28T00:00:00.000Z",
+            "DateCompletion": None,
+        }
+    )
+
+
+@pytest.fixture()
+def mock_family_row_with_non_int_budget_values():  # yields a row whose Funding budget values are not ints
+    yield pd.Series(
+        {
+            "ProjectsID": 3,
+            "ApprovedRef": "FP004",
+            "ProjectName": "Enhancing resilience of marine ecosystems",
+            "Theme": "Adaptation",
+            "Sector": "Private",
+            "ProjectURL": "https://www.climateaction.fund/project/FP004",
+            "Summary": "The Summary of the Project",
+            "Countries": [
+                {"Region": ""},
+            ],
+            "Entities": [{"Name": ""}],
+            "Funding": [
+                {
+                    "Source": "GCF",
+                    "Budget": "82000",  # str, not int
+                    "BudgetUSDeq": "82000",  # str, not int
+                },
+                {
+                    "Source": "Co-Financing",
+                    "Budget": 620000.20,  # float, not int
+                    "BudgetUSDeq": 620000.50,  # float, not int
+                },
+            ],
+            "ResultAreas": [
+                {
+                    "Area": "The Area for the Result Area",
+                    "Type": "The Type for the Result Area",
+                },
+            ],
+            "ApprovalDate": "2016-06-30T00:00:00.000Z",
+            "StartDate": "2024-06-28T00:00:00.000Z",
+            "DateCompletion": None,
         }
     )
 

diff --git a/tests/unit_tests/parsers/family/test_map_family.py b/tests/unit_tests/parsers/family/test_map_family.py
@@ -24,6 +24,7 @@ def parsed_family_data():
                 "result_areas": ["Coastal protection and restoration"],
                 "result_types": ["Adaptation"],
                 "sector": ["Environment"],
+                "status": "Under Implementation",
                 "theme": ["Adaptation"],
             },
             "title": "Enhancing resilience of coastal ecosystems and communities",
@@ -63,11 +64,14 @@ def test_raises_error_on_validating_row_for_missing_columns():
                 "ResultAreas": [{"Area": "Coastal"}],
                 "Summary": "Fake Summary",
                 "ProjectName": "Fake Project Name",
+                "ApprovalDate": "2016-06-30T00:00:00.000Z",
+                "StartDate": "2024-06-28T00:00:00.000Z",
+                "DateCompletion": None,
             }
         ]
     )
 
-    expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovedRef', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'Summary']"
+    expected_error_message = "Required fields ['Countries', 'Sector', 'Theme'] not present in df columns ['ApprovalDate', 'ApprovedRef', 'DateCompletion', 'Entities', 'Funding', 'ProjectName', 'ProjectURL', 'ProjectsID', 'ResultAreas', 'StartDate', 'Summary']"
     with pytest.raises(AttributeError) as e:
         family(test_data_frame, debug=True)
     assert expected_error_message == str(e.value)