Feature/pdct 1398 map family to new json structure (#14)

* feat: map family data: create more modular methods for mapping family data and separate out metadata into its own function. * test: update tests. * feat: add echo to map family data row: tell the user we are skipping the row if the metadata has missing information. * test: update tests and fixtures - separate the test files into mapping family data and metadata - add more fixtures to conftest to test different outcomes, i.e. where we have missing metadata information. * Bump version to 0.1.11. * test: add test asserting output message when metadata is None. * feat: add collections field to the family data being mapped. --------- Co-authored-by: Osneil Drakes <[email protected]> Co-authored-by: Osneil Drakes <[email protected]>
climatepolicyradar · Sep 10, 2024 · 668f12e · 668f12e
1 parent 4665dd9
commit 668f12e
Show file tree

Hide file tree

Showing 6 changed files with 350 additions and 82 deletions.
diff --git a/gcf_data_mapper/enums/family.py b/gcf_data_mapper/enums/family.py
@@ -13,11 +13,14 @@ class FamilyColumnsNames(Enum):
     RESULT_AREAS = "ResultAreas"
     SECTOR = "Sector"
     THEME = "Theme"
+    TITLE = "ProjectName"
+    SUMMARY = "Summary"
 
 
 class FamilyNestedColumnNames(Enum):
     """The fields the GCF data mapper needs to parse nested family data/ metadata."""
 
+    COUNTRY_ISO3 = "ISO3"
     AREA = "Area"
     BUDGET = "BudgetUSDeq"
     NAME = "Name"

diff --git a/gcf_data_mapper/parsers/family.py b/gcf_data_mapper/parsers/family.py
@@ -98,8 +98,54 @@ def map_family_metadata(row: pd.Series) -> Optional[dict]:
     return metadata
 
 
+def map_family_data(
+    row: pd.Series,
+) -> Optional[dict]:
+    """Map the data of a family based on the provided row.
+
+    :param pd.Series row: The row containing family and family metadata information.
+    :return Optional[dict]: A dictionary containing the mapped family data.
+    """
+
+    family_metadata = map_family_metadata(row)
+
+    # Mapping the family metadata returns None when any of its values are empty/falsy,
+    # which means the row must be skipped. In that case there is no point in mapping the
+    # rest of the family data, so we tell the user and return None here as well.
+    if family_metadata is None:
+        click.echo("🛑 Skipping row as family metadata has missing information")
+        return None
+
+    approved_ref = row.at[FamilyColumnsNames.APPROVED_REF.value]
+    projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
+    summary = row.at[FamilyColumnsNames.SUMMARY.value]
+    title = row.at[FamilyColumnsNames.TITLE.value]
+
+    geographies = [
+        country[FamilyNestedColumnNames.COUNTRY_ISO3.value]
+        for country in row.at[FamilyColumnsNames.COUNTRIES.value]
+    ]
+
+    import_id = f"GCF.family.{approved_ref}.{projects_id}"
+
+    family_data = {
+        # For now we are hard coding the category as MCF
+        "category": "MCF",
+        "collections": [],
+        "description": summary,
+        "geographies": geographies,
+        "import_id": import_id,
+        "metadata": family_metadata,
+        "title": title,
+    }
+
+    return family_data
+
+
 def process_row(
-    row: pd.Series, projects_id: str, required_columns: list[str]
+    row: pd.Series,
+    projects_id: str,
+    required_columns: list[str],
 ) -> Optional[dict]:
     """Map the family data based on the provided row.
 
@@ -121,13 +167,12 @@ def process_row(
         )
         return None
 
-    # TODO: Map family data
-    return {
-        "metadata": map_family_metadata(row),
-    }
+    return map_family_data(row)
 
 
-def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, Any]]]:
+def family(
+    gcf_projects_data: pd.DataFrame, debug: bool
+) -> list[Optional[dict[str, Any]]]:
     """Map the GCF family info to new structure.
 
     :param pd.DataFrame gcf_projects_data: The MCF and GCF project data,
@@ -144,9 +189,11 @@ def family(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str,
     mapped_families = []
 
     required_fields = set(str(e.value) for e in FamilyColumnsNames)
-    verify_required_fields_present(projects_data, required_fields)
 
-    for _, row in projects_data.iterrows():
+    verify_required_fields_present(gcf_projects_data, required_fields)
+    # The call above checks that the projects data has all the fields we need
+
+    for _, row in gcf_projects_data.iterrows():
         projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value]
         mapped_families.append(process_row(row, projects_id, list(required_fields)))
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gcf-data-mapper"
-version = "0.1.10"
+version = "0.1.11"
 description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
 authors = ["CPR-dev-team <[email protected]>"]
 license = "Apache-2.0"

diff --git a/tests/unit_tests/parsers/family/conftest.py b/tests/unit_tests/parsers/family/conftest.py
@@ -5,7 +5,7 @@
 
 
 @pytest.fixture()
-def test_family_doc_df():
+def mock_family_doc_df():
     yield pd.DataFrame(
         [
             {
@@ -15,6 +15,7 @@ def test_family_doc_df():
                 "Theme": "Adaptation",
                 "Sector": "Environment",
                 "ProjectURL": "https://www.climateaction.fund/project/FP003",
+                "Summary": "The Summary of the Project",
                 "Countries": [
                     {
                         "CountryName": "Bangladesh",
@@ -51,6 +52,133 @@ def test_family_doc_df():
     )
 
 
+@pytest.fixture()
+def mock_family_row_ds():
+    yield pd.Series(
+        {
+            "ProjectsID": 1,
+            "ApprovedRef": "FP004",
+            "ProjectName": "Enhancing resilience of marine ecosystems",
+            "Theme": "Adaptation",
+            "Sector": "Private",
+            "ProjectURL": "https://www.climateaction.fund/project/FP004",
+            "Summary": "The Summary of the Project",
+            "Countries": [
+                {
+                    "CountryName": "Haiti",
+                    "ISO3": "HTI",
+                    "Region": "Latin America and the Caribbean",
+                },
+            ],
+            "Entities": [
+                {
+                    "Name": "Climate Action Innovations",
+                }
+            ],
+            "Funding": [
+                {
+                    "Source": "GCF",
+                    "Budget": 82000,
+                    "BudgetUSDeq": 82000,
+                },
+                {
+                    "ProjectBudgetID": 412,
+                    "Source": "Co-Financing",
+                    "Budget": 620000,
+                    "BudgetUSDeq": 620000,
+                },
+            ],
+            "ResultAreas": [
+                {
+                    "Area": "The Area for the Result Area",
+                    "Type": "The Type for the Result Area",
+                },
+            ],
+        }
+    )
+
+
+@pytest.fixture()
+def mock_family_row_no_result_areas():
+    yield pd.Series(
+        {
+            "ProjectsID": 2,
+            "ApprovedRef": "FP004",
+            "ProjectName": "Enhancing resilience of marine ecosystems",
+            "Theme": "Adaptation",
+            "Sector": "Private",
+            "ProjectURL": "https://www.climateaction.fund/project/FP004",
+            "Summary": "The Summary of the Project",
+            "Countries": [
+                {
+                    "CountryName": "Haiti",
+                    "ISO3": "HTI",
+                    "Region": "Latin America and the Caribbean",
+                },
+            ],
+            "Entities": [
+                {
+                    "Name": "Climate Action Innovations",
+                }
+            ],
+            "Funding": [
+                {
+                    "Source": "GCF",
+                    "Budget": 82000,
+                    "BudgetUSDeq": 82000,
+                },
+                {
+                    "ProjectBudgetID": 412,
+                    "Source": "Co-Financing",
+                    "Budget": 620000,
+                    "BudgetUSDeq": 620000,
+                },
+            ],
+            "ResultAreas": [
+                {"Area": "", "Type": ""},
+            ],
+        }
+    )
+
+
+@pytest.fixture()
+def mock_family_row_no_entities_no_regions():
+    yield pd.Series(
+        {
+            "ProjectsID": 3,
+            "ApprovedRef": "FP004",
+            "ProjectName": "Enhancing resilience of marine ecosystems",
+            "Theme": "Adaptation",
+            "Sector": "Private",
+            "ProjectURL": "https://www.climateaction.fund/project/FP004",
+            "Summary": "The Summary of the Project",
+            "Countries": [
+                {"Region": ""},
+            ],
+            "Entities": [{"Name": ""}],
+            "Funding": [
+                {
+                    "Source": "GCF",
+                    "Budget": 82000,
+                    "BudgetUSDeq": 82000,
+                },
+                {
+                    "ProjectBudgetID": 412,
+                    "Source": "Co-Financing",
+                    "Budget": 620000,
+                    "BudgetUSDeq": 620000,
+                },
+            ],
+            "ResultAreas": [
+                {
+                    "Area": "The Area for the Result Area",
+                    "Type": "The Type for the Result Area",
+                },
+            ],
+        }
+    )
+
+
 @pytest.fixture()
 def required_family_columns():
     required_columns = [column.value for column in FamilyColumnsNames]