diff --git a/gcf_data_mapper/parsers/document.py b/gcf_data_mapper/parsers/document.py
index 12cdf0f..f5f27fb 100644
--- a/gcf_data_mapper/parsers/document.py
+++ b/gcf_data_mapper/parsers/document.py
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Optional, cast
 from urllib.parse import urlparse
 
@@ -17,6 +18,8 @@
     verify_required_fields_present,
 )
 
+SUPPORTED_FILE_EXTENSIONS = [".pdf", ".html"]
+
 
 def contains_duplicate_urls(urls: list[str]) -> bool:
     """Check a list of urls for any duplicate entries.
@@ -195,6 +198,14 @@ def process_row(row: pd.Series, debug: bool) -> Optional[list[dict[str, Any]]]:
         click.echo(f"🛑 Skipping row with missing required document columns: {doc_id}")
         return None
 
+    source_url = row.at[RequiredDocumentColumns.SOURCE_URL.value]
+    _, ext = os.path.splitext(source_url)
+    if ext.lower() not in SUPPORTED_FILE_EXTENSIONS:
+        click.echo(
+            f"🛑 Skipping row as [{ext}] is not a valid file ext. Project ID: {doc_id}"
+        )
+        return None
+
     mapped_docs = [map_document_metadata(row, DocumentVariantNames.ORIGINAL.value)]
     if has_translated_files(row):
         translated_docs = map_translated_files(row)
diff --git a/pyproject.toml b/pyproject.toml
index 10cc078..b54332a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gcf-data-mapper"
-version = "0.1.14"
+version = "0.1.15"
 description = "A CLI tool to wrangle GCF data into format recognised by the bulk-import tool."
 authors = ["CPR-dev-team "]
 license = "Apache-2.0"
diff --git a/tests/unit_tests/parsers/document/test_process_row.py b/tests/unit_tests/parsers/document/test_process_row.py
index 864bf04..cc67b2a 100644
--- a/tests/unit_tests/parsers/document/test_process_row.py
+++ b/tests/unit_tests/parsers/document/test_process_row.py
@@ -89,3 +89,144 @@ def test_handles_data_with_leading_and_trailing_whitespace(
     ]
 
     assert expected_mapped_doc == process_row(mock_valid_row_with_whitespace, False)
+
+
+@pytest.mark.parametrize(
+    ("test_ds,expected_return,error_message"),
+    [
+        (
+            pd.Series(
+                {
+                    "ApprovedRef": "ref123",
+                    "ProjectsID": "proj123",
+                    "ID (Unique ID from our CMS for the document)": "doc123",
+                    "Type": "Test type",
+                    "Title": "Test title",
+                    "Main file (English)": "link123.pdf",
+                    "Document page permalink": "link123",
+                    "Translated files": pd.NA,
+                    "Translated titles": pd.NA,
+                }
+            ),
+            [
+                {
+                    "import_id": "GCF.document.ref123_proj123.doc123",
+                    "family_import_id": "GCF.family.ref123.proj123",
+                    "metadata": {"type": ["Test type"]},
+                    "title": "Test title",
+                    "source_url": "link123.pdf",
+                    "variant_name": "Original Language",
+                }
+            ],
+            None,
+        ),
+        (
+            pd.Series(
+                {
+                    "ApprovedRef": "ref123",
+                    "ProjectsID": "proj123",
+                    "ID (Unique ID from our CMS for the document)": "doc123",
+                    "Type": "Test type",
+                    "Title": "Test title",
+                    "Main file (English)": "link123.PDF",
+                    "Document page permalink": "link123",
+                    "Translated files": pd.NA,
+                    "Translated titles": pd.NA,
+                }
+            ),
+            [
+                {
+                    "import_id": "GCF.document.ref123_proj123.doc123",
+                    "family_import_id": "GCF.family.ref123.proj123",
+                    "metadata": {"type": ["Test type"]},
+                    "title": "Test title",
+                    "source_url": "link123.PDF",
+                    "variant_name": "Original Language",
+                }
+            ],
+            None,
+        ),
+        (
+            pd.Series(
+                {
+                    "ApprovedRef": "ref123",
+                    "ProjectsID": "proj123",
+                    "ID (Unique ID from our CMS for the document)": "doc123",
+                    "Type": "Test type",
+                    "Title": "Test title",
+                    "Main file (English)": "link123.html",
+                    "Document page permalink": "link123",
+                    "Translated files": pd.NA,
+                    "Translated titles": pd.NA,
+                }
+            ),
+            [
+                {
+                    "import_id": "GCF.document.ref123_proj123.doc123",
+                    "family_import_id": "GCF.family.ref123.proj123",
+                    "metadata": {"type": ["Test type"]},
+                    "title": "Test title",
+                    "source_url": "link123.html",
+                    "variant_name": "Original Language",
+                }
+            ],
+            None,
+        ),
+        (
+            pd.Series(
+                {
+                    "ApprovedRef": "ref123",
+                    "ProjectsID": "proj123",
+                    "ID (Unique ID from our CMS for the document)": "doc123",
+                    "Type": "Test type",
+                    "Title": "Test title",
+                    "Main file (English)": "link123.HTML",
+                    "Document page permalink": "link123",
+                    "Translated files": pd.NA,
+                    "Translated titles": pd.NA,
+                }
+            ),
+            [
+                {
+                    "import_id": "GCF.document.ref123_proj123.doc123",
+                    "family_import_id": "GCF.family.ref123.proj123",
+                    "metadata": {"type": ["Test type"]},
+                    "title": "Test title",
+                    "source_url": "link123.HTML",
+                    "variant_name": "Original Language",
+                }
+            ],
+            None,
+        ),
+        (
+            pd.Series(
+                {
+                    "ApprovedRef": "ref123",
+                    "ProjectsID": "proj123",
+                    "ID (Unique ID from our CMS for the document)": "doc123",
+                    "Type": "Test type",
+                    "Title": "Test title",
+                    "Main file (English)": "link123.xlsx",
+                    "Document page permalink": "link123",
+                    "Translated files": pd.NA,
+                    "Translated titles": pd.NA,
+                }
+            ),
+            None,
+            "🛑 Skipping row as [.xlsx] is not a valid file ext. Project ID: doc123",
+        ),
+    ],
+)
+def test_validates_url_has_a_supported_extension(
+    test_ds: pd.Series,
+    expected_return,
+    error_message: str,
+    capsys,
+):
+    document_data = process_row(test_ds, debug=False)
+
+    assert expected_return == document_data
+
+    if error_message:
+        captured = capsys.readouterr()
+        assert error_message == captured.out.strip()