climatepolicyradar · odrakes-cpr · Sep 10, 2024 · Sep 10, 2024 · Sep 11, 2024 · Sep 11, 2024
@@ -1,8 +1,10 @@
 from typing import Any, Iterable, Optional
+from typing import Any, Iterable, Optional, Union
 
 import click
 import pandas as pd
 
+from gcf_data_mapper.enums.event import EventColumnNames, Events
 from gcf_data_mapper.enums.event import EventColumnNames, Events
 from gcf_data_mapper.enums.family import (
     FamilyColumnsNames,
@@ -58,12 +60,56 @@ def calculate_status(row: pd.Series) -> Optional[str]:
 
 
 def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
def contains_invalid_date_entries(list_of_dates: Iterable[pd.Timestamp]) -> bool:
    """Report whether an iterable of parsed dates contains a NaT entry.

    Only the ``pd.NaT`` singleton is treated as invalid. ``None`` and NaN
    entries are not flagged, which lets callers tell "no date supplied"
    apart from "a date was supplied but could not be parsed".

    :param Iterable[pd.Timestamp] list_of_dates: The parsed dates to inspect;
        entries may also be None.
    :return bool: True if at least one entry is ``pd.NaT``, otherwise False.
    """
    for entry in list_of_dates:
        # Identity check on purpose: NaT is a singleton, and None/NaN must not match.
        if entry is pd.NaT:
            return True
    return False
+
+
def calculate_status(row: pd.Series) -> Optional[str]:
    """Calculate the status of a project from its event dates.

    The latest lifecycle stage whose date has been reached wins:
        Completed : (NOW is past date-completion)
        Under implementation : (NOW is past start-date)
        Approved : (NOW is past approved-date)

    :param pd.Series row: The row containing the event date columns named by
        ``Events.<EVENT>.column_name``.
    :return Optional[str]: The ``type`` of the matching event, or None if any
        date is unparseable (NaT) or no supplied date has been reached yet.
    """
    # Arranged to reflect the project lifecycle in reverse order, from the final
    # stage to the initial stage, so the most advanced reached stage is returned.
    lifecycle = (Events.COMPLETED, Events.UNDER_IMPLEMENTATION, Events.APPROVED)

    # utc=True makes every parsed date timezone-aware. Without it a date string
    # lacking an offset parses to a naive Timestamp, and comparing that against
    # the timezone-aware `now` below raises TypeError. None still parses to
    # None, and unparseable values still parse to NaT.
    event_dates = [
        pd.to_datetime(row.at[event.column_name], utc=True) for event in lifecycle
    ]

    if contains_invalid_date_entries(event_dates):
        click.echo("- Row contains invalid date entries")
        return None

    now = pd.Timestamp.now(tz="UTC")

    for event, event_date in zip(lifecycle, event_dates):
        # pd.notna filters out None entries, i.e. events with no date supplied.
        if pd.notna(event_date) and now >= event_date:
            return event.type

    click.echo("- Row missing event date information to calculate status")
    return None
+
+
+def get_budgets(
+    funding_list: list[dict], source: str
+) -> Optional[list[Union[int, float]]]:
     """Get the budget amount from the row based on the funding source.
 
     :param list[dict] row: A list of all the funding information, represented in dictionaries
     :param str source: The funding source to retrieve the budget from.
 
-    :return Optional[list[int]]: A list of budget amounts corresponding to the source,
+    :return Optional[list[Union[int, float]]]: A list of budget amounts corresponding to the source,
         or [0] if the source is not found.
     """
 
@@ -75,8 +121,8 @@ def get_budgets(funding_list: list[dict], source: str) -> Optional[list[int]]:
     ]
 
     # Check for any invalid values
-    if any(not isinstance(budget, (int)) for budget in budgets):
-        click.echo("🛑 Funding entries does not have valid int budget values")
+    if any(not isinstance(budget, (int, float)) for budget in budgets):
+        click.echo("- Funding entries does not have valid (int, float) budget values")
         return None
 
     # Where we have projects which have been solely funded by the fund (GCF), or solely co-financed
@@ -95,6 +141,11 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
 
     status = calculate_status(row)
 
+    if status is None:
+        return None
+
+    status = calculate_status(row)
+
     if status is None:
         return None
 
@@ -148,6 +199,7 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
         "result_types": list(set(types)),
         "sector": [row.at[FamilyColumnsNames.SECTOR.value]],
         "status": status,
+        "status": status,
         "theme": [row.at[FamilyColumnsNames.THEME.value]],
     }
 
@@ -164,16 +216,18 @@ def map_family_data(
     """
 
     family_metadata = map_family_metadata(row)
+    projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
 
     # When processing the family metadata if there are any empty/falsy values we return None
     # and skip the row. Therefore we don't want to process the rest of the family data so we
     # return None in this conditional.
     if family_metadata is None:
-        click.echo("🛑 Skipping row as family metadata has missing information")
+        click.echo(
+            f"🛑 Skipping row as family metadata has missing information, see ProjectsID : {projects_id}"
+        )
         return None
 
     approved_ref = row.at[FamilyColumnsNames.APPROVED_REF.value]
-    projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
     summary = row.at[FamilyColumnsNames.SUMMARY.value]
     title = row.at[FamilyColumnsNames.TITLE.value]
 
@@ -244,6 +298,8 @@ def family(
 
     mapped_families = []
 
+    family_columns = set(str(e.value) for e in FamilyColumnsNames)
+    required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))
     family_columns = set(str(e.value) for e in FamilyColumnsNames)
     required_fields = family_columns.union(set(str(e.value) for e in EventColumnNames))
 
@@ -254,6 +310,12 @@ def family(
     required_fields -= set(
         str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
     )
+    # Whilst we expect the event columns to be present, some of the events in the data may have empty values.
+    # We therefore want to exclude these from the `row_contains_columns_with_empty_values` function,
+    # and handle any empty event values in the `calculate_status` function.
+    required_fields -= set(
+        str(e.value) for e in EventColumnNames if str(e.value) not in family_columns
+    )
 
     for _, row in gcf_projects_data.iterrows():
         projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]

@@ -50,6 +50,9 @@ def mock_family_doc_df():
                 "ApprovalDate": "2016-06-30T00:00:00.000Z",
                 "StartDate": "2024-06-28T00:00:00.000Z",
                 "DateCompletion": None,
+                "ApprovalDate": "2016-06-30T00:00:00.000Z",
+                "StartDate": "2024-06-28T00:00:00.000Z",
+                "DateCompletion": None,
             }
         ]
     )
@@ -100,6 +103,9 @@ def mock_family_row_ds():
             "ApprovalDate": "2016-06-30T00:00:00.000Z",
             "StartDate": "2024-06-28T00:00:00.000Z",
             "DateCompletion": None,
+            "ApprovalDate": "2016-06-30T00:00:00.000Z",
+            "StartDate": "2024-06-28T00:00:00.000Z",
+            "DateCompletion": None,
         }
     )
 
@@ -146,6 +152,9 @@ def mock_family_row_no_result_areas():
             "ApprovalDate": "2016-06-30T00:00:00.000Z",
             "StartDate": "2024-06-28T00:00:00.000Z",
             "DateCompletion": None,
+            "ApprovalDate": "2016-06-30T00:00:00.000Z",
+            "StartDate": "2024-06-28T00:00:00.000Z",
+            "DateCompletion": None,
         }
     )
 
@@ -192,7 +201,7 @@ def mock_family_row_no_entities_no_regions():
 
 
 @pytest.fixture()
-def mock_family_row_with_non_int_budget_values():
+def mock_family_row_with_non_int_non_float_budget_values():
     yield pd.Series(
         {
             "ProjectsID": 3,
@@ -214,8 +223,8 @@ def mock_family_row_with_non_int_budget_values():
                 },
                 {
                     "Source": "Co-Financing",
-                    "Budget": 620000.20,
-                    "BudgetUSDeq": 620000.50,
+                    "Budget": "620000.20",
+                    "BudgetUSDeq": "620000.50",
                 },
             ],
             "ResultAreas": [

@@ -152,15 +152,17 @@ def test_skips_processing_row_if_row_contains_empty_values(
 
 
 def test_skips_processing_row_if_family_metadata_has_missing_data(
-    mock_family_row_no_result_areas, capsys
+    mock_family_row_no_result_areas: pd.Series, capsys
 ):
+    projects_id = mock_family_row_no_result_areas.ProjectsID
+
     family_data = map_family_data(mock_family_row_no_result_areas)
     assert family_data is None
     captured = capsys.readouterr()
     # We have two outputs, one from map_family_metadata pointing to the missing data and the second
     # from map_family_data informing that the row is being skipped
     map_family_data_output = captured.out.strip().split("\n")
     assert (
-        "🛑 Skipping row as family metadata has missing information"
+        f"🛑 Skipping row as family metadata has missing information, see ProjectsID : {projects_id}"
         == map_family_data_output[1]
     )
@@ -1,5 +1,7 @@
 from typing import Optional
 
+from typing import Optional
+
 import pandas as pd
 import pytest
 
@@ -10,6 +12,13 @@
     get_budgets,
     map_family_metadata,
 )
+from gcf_data_mapper.enums.event import Events
+from gcf_data_mapper.parsers.family import (
+    calculate_status,
+    contains_invalid_date_entries,
+    get_budgets,
+    map_family_metadata,
+)
 
 
 @pytest.fixture()
@@ -129,9 +138,9 @@ def test_returns_expected_value_when_parsing_budget_data(
 
 
 def test_map_family_metadata_returns_none_if_budget_does_not_contain_valid_int_types(
-    mock_family_row_with_non_int_budget_values: pd.Series,
+    mock_family_row_with_non_int_non_float_budget_values: pd.Series,
 ):
-    result = map_family_metadata(mock_family_row_with_non_int_budget_values)
+    result = map_family_metadata(mock_family_row_with_non_int_non_float_budget_values)
     assert result is None
 
 
@@ -246,7 +255,7 @@ def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value):
                     "DateCompletion": pd.NA,
                 }
             ),
-            "🛑 Row contains invalid date entries",
+            "- Row contains invalid date entries",
         ),
         (
             pd.Series(
@@ -256,7 +265,7 @@ def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value):
                     "DateCompletion": "",
                 }
             ),
-            "🛑 Row contains invalid date entries",
+            "- Row contains invalid date entries",
         ),
         (
             pd.Series(
@@ -266,7 +275,7 @@ def test_dates_contain_invalid_date_entries(list_of_dates: list, return_value):
                     "DateCompletion": None,
                 }
             ),
-            "🛑 Row missing event date information to calculate status",
+            "- Row missing event date information to calculate status",
         ),
     ],
 )