From f65de822d85c2b26d9f4cdefc182d222776e6e9c Mon Sep 17 00:00:00 2001 From: annaCPR Date: Mon, 11 Nov 2024 12:55:59 +0000 Subject: [PATCH] Strip whitespace from values (#28) * Strip whitespace from row values when mapping family data * Strip whitespace from row values when mapping docs * Strip whitespace from row values when mapping events * Bump patch version --- .trunk/configs/cspell.json | 3 +- gcf_data_mapper/parsers/document.py | 3 ++ gcf_data_mapper/parsers/event.py | 5 +- gcf_data_mapper/parsers/family.py | 6 ++- gcf_data_mapper/parsers/helpers.py | 13 +++++ pyproject.toml | 2 +- tests/unit_tests/parsers/document/conftest.py | 16 ++++++ .../parsers/document/test_process_row.py | 18 +++++++ tests/unit_tests/parsers/event/test_event.py | 38 ++++++++++++++ tests/unit_tests/parsers/family/conftest.py | 49 +++++++++++++++++++ .../parsers/family/test_map_family.py | 32 ++++++++++++ 11 files changed, 179 insertions(+), 6 deletions(-) diff --git a/.trunk/configs/cspell.json b/.trunk/configs/cspell.json index 9667033..fa06ea8 100644 --- a/.trunk/configs/cspell.json +++ b/.trunk/configs/cspell.json @@ -41,7 +41,8 @@ "isin", "pydantic", "getfixturevalue", - "isna" + "isna", + "AAABBB" ], "flagWords": ["hte"], "suggestionsTimeout": 5000 diff --git a/gcf_data_mapper/parsers/document.py b/gcf_data_mapper/parsers/document.py index 0b31152..12cdf0f 100644 --- a/gcf_data_mapper/parsers/document.py +++ b/gcf_data_mapper/parsers/document.py @@ -13,6 +13,7 @@ ) from gcf_data_mapper.parsers.helpers import ( check_required_column_value_not_na, + strip_nested, verify_required_fields_present, ) @@ -177,6 +178,8 @@ def process_row(row: pd.Series, debug: bool) -> Optional[list[dict[str, Any]]]: the 'destination' format described in the GCF Data Mapper Google Sheet. 
""" + row = cast(pd.Series, row.apply(strip_nested)) + doc_id = ( row.at[RequiredDocumentColumns.ID.value] if RequiredDocumentColumns.ID.value in row.index diff --git a/gcf_data_mapper/parsers/event.py b/gcf_data_mapper/parsers/event.py index 2b4cd5c..d2b3db4 100644 --- a/gcf_data_mapper/parsers/event.py +++ b/gcf_data_mapper/parsers/event.py @@ -1,10 +1,10 @@ -from typing import Any, Optional +from typing import Any, Optional, cast import click import pandas as pd from gcf_data_mapper.enums.event import Event, EventColumnNames, Events -from gcf_data_mapper.parsers.helpers import verify_required_fields_present +from gcf_data_mapper.parsers.helpers import strip_nested, verify_required_fields_present def append_event( @@ -134,6 +134,7 @@ def event(projects_data: pd.DataFrame, debug: bool) -> list[Optional[dict[str, A event_counter = {} for _, row in projects_data.iterrows(): + row = cast(pd.Series, row.apply(strip_nested)) approved_ref = row.at[EventColumnNames.APPROVED_REF.value] projects_id = row.at[EventColumnNames.PROJECTS_ID.value] process_event(row, gcf_events, event_counter, approved_ref, projects_id) diff --git a/gcf_data_mapper/parsers/family.py b/gcf_data_mapper/parsers/family.py index e88d94a..c68e565 100644 --- a/gcf_data_mapper/parsers/family.py +++ b/gcf_data_mapper/parsers/family.py @@ -1,4 +1,4 @@ -from typing import Any, Iterable, Optional +from typing import Any, Iterable, Optional, cast import click import pandas as pd @@ -12,6 +12,7 @@ from gcf_data_mapper.parsers.helpers import ( arrays_contain_empty_values, row_contains_columns_with_empty_values, + strip_nested, verify_required_fields_present, ) @@ -222,6 +223,7 @@ def process_row( ) return None + row = cast(pd.Series, row.apply(strip_nested)) return map_family_data(row) @@ -255,7 +257,7 @@ def family( ) for _, row in gcf_projects_data.iterrows(): - projects_id = row.at[FamilyColumnsNames.PROJECTS_ID.value] + projects_id = str(row.at[FamilyColumnsNames.PROJECTS_ID.value]).strip() 
mapped_families.append(process_row(row, projects_id, list(required_fields))) return mapped_families diff --git a/gcf_data_mapper/parsers/helpers.py b/gcf_data_mapper/parsers/helpers.py index 1f1f471..539f1a7 100644 --- a/gcf_data_mapper/parsers/helpers.py +++ b/gcf_data_mapper/parsers/helpers.py @@ -1,3 +1,5 @@ +from typing import Any + import click import pandas as pd @@ -69,3 +71,14 @@ def arrays_contain_empty_values(list_values: list[tuple], id: str) -> bool: return True return False + + +def strip_nested(value: Any) -> Any: + """Recursively strip strings in nested structures.""" + if isinstance(value, str): + return value.strip() + elif isinstance(value, list): + return [strip_nested(item) for item in value] + elif isinstance(value, dict): + return {key: strip_nested(val) for key, val in value.items()} + return value diff --git a/pyproject.toml b/pyproject.toml index 332d46a..10cc078 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gcf-data-mapper" -version = "0.1.13" +version = "0.1.14" description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool." 
authors = ["CPR-dev-team "] license = "Apache-2.0" diff --git a/tests/unit_tests/parsers/document/conftest.py b/tests/unit_tests/parsers/document/conftest.py index 5597745..552ac9f 100644 --- a/tests/unit_tests/parsers/document/conftest.py +++ b/tests/unit_tests/parsers/document/conftest.py @@ -97,6 +97,22 @@ def mock_valid_row(): ) +@pytest.fixture +def mock_valid_row_with_whitespace(): + return pd.Series( + { + "ApprovedRef": " ref123 ", + "ProjectsID": " proj123 ", + "ID (Unique ID from our CMS for the document)": " doc123 ", + "Type": " type123 ", + "Title": " title123 ", + "Main file (English)": " link123.pdf ", + "Document page permalink": " link123 ", + "Translated titles": None, + } + ) + + @pytest.fixture def mock_gcf_docs(): return pd.DataFrame( diff --git a/tests/unit_tests/parsers/document/test_process_row.py b/tests/unit_tests/parsers/document/test_process_row.py index 80afe08..864bf04 100644 --- a/tests/unit_tests/parsers/document/test_process_row.py +++ b/tests/unit_tests/parsers/document/test_process_row.py @@ -71,3 +71,21 @@ def test_process_row_returns_none_with_na_in_required_columns( assert process_row(row, debug=False) is None captured = capsys.readouterr() assert expected_error_msg in captured.out + + +def test_handles_data_with_leading_and_trailing_whitespace( + mock_valid_row_with_whitespace, +): + + expected_mapped_doc = [ + { + "import_id": "GCF.document.ref123_proj123.doc123", + "family_import_id": "GCF.family.ref123.proj123", + "metadata": {"type": ["type123"]}, + "title": "title123", + "source_url": "link123.pdf", + "variant_name": "Original Language", + } + ] + + assert expected_mapped_doc == process_row(mock_valid_row_with_whitespace, False) diff --git a/tests/unit_tests/parsers/event/test_event.py b/tests/unit_tests/parsers/event/test_event.py index 072f360..968a67d 100644 --- a/tests/unit_tests/parsers/event/test_event.py +++ b/tests/unit_tests/parsers/event/test_event.py @@ -61,3 +61,41 @@ def 
test_event_handles_partial_valid_dates(): ) result = event(projects_data, debug=False) assert len(result) == 3 + + +def test_handles_data_with_leading_and_trailing_whitespace(): + mock_projects_data = pd.DataFrame( + { + "ApprovalDate": [" 2023-01-01 ", None], + "StartDate": [None, " 2023-06-01"], + "DateCompletion": ["2023-12-31 ", None], + "ApprovedRef": [" FP123 ", " FP124 "], + "ProjectsID": [" PID456 ", " PID457 "], + } + ) + + expected_mapped_events = [ + { + "date": "2023-01-01", + "event_title": "Project Approved", + "event_type_value": "Project Approved", + "import_id": "GCF.event.FP123_PID456.n0000", + "family_import_id": "GCF.family.FP123.PID456", + }, + { + "date": "2023-12-31", + "event_title": "Project Completed", + "event_type_value": "Project Completed", + "family_import_id": "GCF.family.FP123.PID456", + "import_id": "GCF.event.FP123_PID456.n0001", + }, + { + "date": "2023-06-01", + "event_title": "Under Implementation", + "event_type_value": "Under Implementation", + "family_import_id": "GCF.family.FP124.PID457", + "import_id": "GCF.event.FP124_PID457.n0000", + }, + ] + + assert expected_mapped_events == event(mock_projects_data, False) diff --git a/tests/unit_tests/parsers/family/conftest.py b/tests/unit_tests/parsers/family/conftest.py index 4f34a79..b396243 100644 --- a/tests/unit_tests/parsers/family/conftest.py +++ b/tests/unit_tests/parsers/family/conftest.py @@ -231,6 +231,55 @@ def mock_family_row_with_non_int_non_float_budget_values(): ) +@pytest.fixture() +def mock_family_doc_with_whitespace(): + yield pd.Series( + { + "ProjectsID": " AAABBB ", + "ApprovedRef": " FP003 ", + "ProjectName": " Enhancing resilience of coastal ecosystems and communities", + "Theme": " Adaptation ", + "Sector": " Environment ", + "ProjectURL": " https://www.climateaction.fund/project/FP003 ", + "Summary": " The Summary of the Project ", + "Countries": [ + { + "CountryName": " Bangladesh ", + "ISO3": " BGD ", + "Region": " Asia ", + }, + ], + "Entities": [ + { + 
"Name": " Green Innovations ", + } + ], + "Funding": [ + { + "Source": " GCF ", + "Budget": 9200000, + "BudgetUSDeq": 9200000, + }, + { + "ProjectBudgetID": 412, + "Source": " Co-Financing ", + "Budget": 620000, + "BudgetUSDeq": 620000, + }, + ], + "ResultAreas": [ + { + "Area": " Coastal protection and restoration ", + "Type": " Adaptation ", + }, + ], + "ApprovalDate": " 2016-06-30T00:00:00.000Z ", + "StartDate": " 2024-06-28T00:00:00.000Z ", + "DateCompletion": None, + } + ) + + @pytest.fixture() def required_family_columns(): required_columns = [column.value for column in FamilyColumnsNames] diff --git a/tests/unit_tests/parsers/family/test_map_family.py b/tests/unit_tests/parsers/family/test_map_family.py index b7fc1c9..803e94a 100644 --- a/tests/unit_tests/parsers/family/test_map_family.py +++ b/tests/unit_tests/parsers/family/test_map_family.py @@ -165,3 +165,35 @@ def test_skips_processing_row_if_family_metadata_has_missing_data( f"🛑 Skipping row as family metadata has missing information, ProjectsID : {projects_id}" == map_family_data_output[1] ) + + +def test_handles_data_with_leading_and_trailing_whitespace( + mock_family_doc_with_whitespace, +): + + expected_mapped_family = { + "category": "MCF", + "collections": [], + "summary": "The Summary of the Project", + "geographies": ["BGD"], + "import_id": "GCF.family.FP003.AAABBB", + "metadata": { + "approved_ref": ["FP003"], + "implementing_agency": ["Green Innovations"], + "project_id": ["AAABBB"], + "project_url": ["https://www.climateaction.fund/project/FP003"], + "project_value_fund_spend": ["9200000"], + "project_value_co_financing": ["620000"], + "region": ["Asia"], + "result_area": ["Coastal protection and restoration"], + "result_type": ["Adaptation"], + "sector": ["Environment"], + "status": ["Under Implementation"], + "theme": ["Adaptation"], + }, + "title": "Enhancing resilience of coastal ecosystems and communities", + } + + assert expected_mapped_family == process_row( + 
mock_family_doc_with_whitespace, " AAABBB ", [] + )