Feature/pdct 1440 implement multi geographies in the navigator backend (#386)

* Update geo_subquery to support multiple geographies

* Put code in fenced block

* Update DB client to 3.8.19

* Implement multiple geographies

* Add multiple geo family to setup_helpers

* Parametrize tests to test single and multi geo

* Update comment with ticket number

* Update docstrings with types

* Add ticket number to comment

* Bump to 1.18.1

* Remove comment

* Split pipeline query into smaller queries for debug

* Add missing param and return type in docstring

* Fix line length on docstring

* Update stop & rm

* Revert changes to pipeline query

* Make pipeline query support multi geos (#388)

* Make pipeline query support multi geos

* Make ordering of db objects the same

* Update comment

* Make sure query is more equivalent to incumbent

* Create db_state_validator.py

* Create way of validating two db state files

* Revert "Create db_state_validator.py"

This reverts commit a03f693.

* Get most recent slug comments

* Break formatting of db_state contents into separate function

* Refactor generating db_state content into new function

* Updating the script to use sdk models. (#389)

Co-authored-by: Mark <[email protected]>

* Add test cases for checking db_state content

* Refactor write_documents_to_s3 for test isolation

---------

Co-authored-by: NextGenEng <[email protected]>
Co-authored-by: Mark <[email protected]>

* Bump to 1.19.0

* Fix test

* Create pipeline.sql

* Remove whitespace

* Move pipeline service functions into separate file

* Rename flatten_pipeline_metadata to _flatten_pipeline_metadata

* Remove service function

* Move pipeline repo functions under repository

* Update CODEOWNERS

* Remove unused code

* Move pipeline.sql under sql folder

* Use raw SQL to execute pipeline query

* Add data_client as dependency to mock get_db

* Move pipeline tests under non_search

* Create test_flatten_pipeline_metadata.py

* Move file

* Driveby: Use os for path reference

* Add missing type hint

* Remove title casing

* Add return type hint

* Add missing param in docstring

* PR comment fixes

* Fix languages

* Use attribute accessor for series

* Move file

* Format family category as it's always CAPS in db

* Format query

* Make casing consistent

* Make published_date timezone aware

---------

Co-authored-by: NextGenEng <[email protected]>
Co-authored-by: Mark <[email protected]>
3 people authored Oct 22, 2024
1 parent a806001 commit 038163e
Showing 25 changed files with 1,127 additions and 370 deletions.
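The commit list above includes "Update geo_subquery to support multiple geographies", but the subquery itself is not visible in the expanded diff below. The following is only a minimal sketch, assuming hypothetical `Geography` and `FamilyGeography` ORM models and a PostgreSQL backend, of how a per-family geography aggregation could be built with SQLAlchemy; the model and column names are illustrative, not taken from the repository.

```python
# Minimal sketch (hypothetical models, not from this diff) of a geo subquery
# that returns every geography linked to a family rather than a single value.
from sqlalchemy import Column, ForeignKey, Integer, String, func, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Geography(Base):
    __tablename__ = "geography"
    id = Column(Integer, primary_key=True)
    value = Column(String)  # e.g. an ISO code such as "GBR"


class FamilyGeography(Base):
    __tablename__ = "family_geography"
    family_import_id = Column(String, primary_key=True)
    geography_id = Column(Integer, ForeignKey("geography.id"), primary_key=True)


def geo_subquery():
    """One row per family, with all linked geography values collected into an array."""
    return (
        select(
            FamilyGeography.family_import_id,
            func.array_agg(Geography.value).label("geographies"),
        )
        .join(Geography, Geography.id == FamilyGeography.geography_id)
        .group_by(FamilyGeography.family_import_id)
        .subquery()
    )
```

An outer query can then join this subquery on `family_import_id` to attach the list of geographies to each family row.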
4 changes: 3 additions & 1 deletion .github/CODEOWNERS
@@ -17,7 +17,9 @@
 app/api/api_v1/routers/pipeline_trigger.py @climatepolicyradar/deng
 tests/unit/app/core/test_pipeline.py @climatepolicyradar/deng
 tests/non_search/app/test_pipeline_trigger.py @climatepolicyradar/deng
-app/core/ingestion/ @climatepolicyradar/deng
+app/repository/pipeline.py @climatepolicyradar/deng
+app/service/pipeline.py @climatepolicyradar/deng
+app/service/pipeline.sql @climatepolicyradar/deng
 tests/search/search_fixtures/vespa_test_schema/ @climatepolicyradar/deng
 app/api/api_v1/routers/search.py @climatepolicyradar/tech-devs
 tests/search/ @climatepolicyradar/tech-devs
2 changes: 2 additions & 0 deletions .gitignore
@@ -196,3 +196,5 @@ requirements.txt
 
 # Vespa secrets directory
 secrets/*
+
+scripts/*.json
6 changes: 3 additions & 3 deletions app/api/api_v1/routers/pipeline_trigger.py
@@ -6,9 +6,9 @@
 from app.api.api_v1.schemas.document import BulkIngestResult
 from app.clients.aws.client import S3Client, get_s3_client
 from app.clients.db.session import get_db
-from app.core.ingestion.pipeline import generate_pipeline_ingest_input
 from app.core.validation.util import get_new_s3_prefix, write_documents_to_s3
 from app.service.auth import get_superuser_details
+from app.service.pipeline import get_db_state_content
 
 _LOGGER = logging.getLogger(__name__)

@@ -22,11 +22,11 @@ def _start_ingest(
 ):
     """Writes a db state file to s3 which will trigger an ingest."""
     try:
-        pipeline_ingest_input = generate_pipeline_ingest_input(db)
+        pipeline_ingest_input_content = get_db_state_content(db)
         write_documents_to_s3(
             s3_client=s3_client,
             s3_prefix=s3_prefix,
-            documents=pipeline_ingest_input,
+            content=pipeline_ingest_input_content,
         )
     except Exception as e:
         _LOGGER.exception(
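The diff above swaps `generate_pipeline_ingest_input` for `get_db_state_content`, which now hands the endpoint the full db-state document as a plain dict. The new app/service/pipeline.py is not shown in this view, so the following is only a rough sketch of the content-building step, under the assumption that the output keeps the previous `{"documents": {...}}` shape and that each repository row exposes an `import_id` and a `to_json()`-style serialisation; the names `PipelineRow` and `build_db_state_content` are hypothetical.

```python
from typing import Any, Protocol, Sequence


class PipelineRow(Protocol):
    """Shape each pipeline row is assumed to expose (hypothetical)."""

    import_id: str

    def to_json(self) -> dict[str, Any]: ...


def build_db_state_content(rows: Sequence[PipelineRow]) -> dict[str, Any]:
    """Assemble the db state document written to S3 to trigger a pipeline run.

    Sketch only: the real get_db_state_content presumably runs the pipeline
    query itself; here the rows are passed in to keep the example self-contained.
    """
    return {"documents": {row.import_id: row.to_json() for row in rows}}
```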
8 changes: 5 additions & 3 deletions app/api/api_v1/schemas/document.py
@@ -77,11 +77,12 @@ def _filter_climate_laws_url_from_source(cls, v):
 
 
 class FamilyContext(BaseModel):
-    """Used to given the family context when returning a FamilyDocument"""
+    """Used to give the family context when returning a FamilyDocument"""
 
     title: str
     import_id: str
-    geography: str
+    geography: str  # Keep this for backward compatibility PDCT-1440
+    geographies: list[str]
     category: str
     slug: str
     published_date: Optional[datetime] = None
@@ -103,7 +104,8 @@ class FamilyAndDocumentsResponse(BaseModel):
     import_id: str
     title: str
     summary: str
-    geography: str
+    geography: str  # Keep this for backward compatibility PDCT-1440
+    geographies: list[str]
     category: str
     status: str
     metadata: dict
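Both response models now carry `geographies: list[str]` alongside the legacy `geography: str` field kept for backward compatibility (PDCT-1440). A minimal sketch of that backward-compatibility pattern follows, assuming the singular field simply mirrors the first entry when several geographies exist; that convention is an assumption, not something this diff confirms.

```python
from pydantic import BaseModel


class FamilyContextSketch(BaseModel):
    """Cut-down stand-in for FamilyContext, showing only the geography fields."""

    geography: str  # legacy single value, kept for backward compatibility
    geographies: list[str]


def family_context_from_geographies(geographies: list[str]) -> FamilyContextSketch:
    # Assumption: the legacy field mirrors the first geography when several exist.
    return FamilyContextSketch(
        geography=geographies[0] if geographies else "",
        geographies=geographies,
    )


print(family_context_from_geographies(["GBR", "FRA"]).geography)  # -> "GBR"
```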
3 changes: 2 additions & 1 deletion app/core/download.py
@@ -1,5 +1,6 @@
 """Functions to support browsing the RDS document structure"""
 
+import os
 import zipfile
 from functools import lru_cache
 from io import BytesIO, StringIO
@@ -16,7 +17,7 @@
 
 @lru_cache()
 def _get_query_template():
-    with open("./app/core/download.sql", "r") as file:
+    with open(os.path.join("app", "core", "download.sql"), "r") as file:
         return file.read()


86 changes: 0 additions & 86 deletions app/core/ingestion/match.py

This file was deleted.

143 changes: 0 additions & 143 deletions app/core/ingestion/pipeline.py

This file was deleted.
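app/core/ingestion/pipeline.py is deleted; per the commit list, the query it built now lives as raw SQL in pipeline.sql (later moved under an sql folder) and is executed by a repository function. Below is a minimal sketch of that pattern, reusing the lru_cache/os.path template-loading approach seen in app/core/download.py above; the file path and function names are assumptions, not the repository's actual ones.

```python
import os
from functools import lru_cache

from sqlalchemy import text
from sqlalchemy.orm import Session


@lru_cache()
def _get_pipeline_query():
    # Assumed location; the commits move pipeline.sql around, so the final path may differ.
    with open(os.path.join("app", "repository", "sql", "pipeline.sql"), "r") as file:
        return file.read()


def run_pipeline_query(db: Session) -> list[dict]:
    """Execute the raw pipeline SQL and return rows as plain dicts (sketch)."""
    result = db.execute(text(_get_pipeline_query()))
    return [dict(row) for row in result.mappings()]
```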

11 changes: 3 additions & 8 deletions app/core/validation/util.py
@@ -6,7 +6,6 @@
 from io import BytesIO
 from typing import Any, Collection, Mapping, Optional, Sequence, Union
 
-from app.api.api_v1.schemas.document import DocumentParserInput
 from app.clients.aws.client import S3Client
 from app.clients.aws.s3_document import S3Document
 from app.config import INGEST_TRIGGER_ROOT, PIPELINE_BUCKET
@@ -40,22 +39,18 @@ def get_value(data_node: Mapping[str, str]) -> str:
 
 
 def write_documents_to_s3(
-    s3_client: S3Client,
-    s3_prefix: str,
-    documents: Sequence[DocumentParserInput],
+    s3_client: S3Client, s3_prefix: str, content: dict[str, Any]
 ) -> Union[S3Document, bool]:
     """
     Write current state of documents into S3 to trigger a pipeline run after bulk ingest
 
     :param [S3Client] s3_client: an S3 client to use to write data
     :param [str] s3_prefix: prefix into which to write the document updates in s3
+    :param [dict[str, Any]] content: db state document content
     :param [Sequence[DocumentCreateRequest]] documents: a sequence of document
         specifications to write to S3
     """
-    json_content = json.dumps(
-        {"documents": {d.import_id: d.to_json() for d in documents}},
-        indent=2,
-    )
+    json_content = json.dumps(content, indent=2)
     bytes_content = BytesIO(json_content.encode("utf8"))
     documents_object_key = f"{s3_prefix}/db_state.json"
     _LOGGER.info("Writing Documents file into S3")
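With the new signature, write_documents_to_s3 no longer serialises a sequence of DocumentParserInput objects itself; callers pass the already-built content dict. A minimal usage sketch follows, assuming get_s3_client() and get_new_s3_prefix() can be called directly outside FastAPI dependency injection; the sample content is illustrative only.

```python
# Usage sketch for the new write_documents_to_s3 signature (sample data is made up).
from app.clients.aws.client import get_s3_client
from app.core.validation.util import get_new_s3_prefix, write_documents_to_s3

s3_client = get_s3_client()
s3_prefix = get_new_s3_prefix()

content = {
    "documents": {
        "CCLW.executive.1.1": {
            "import_id": "CCLW.executive.1.1",
            "geographies": ["GBR", "FRA"],  # multi-geography support (PDCT-1440)
        }
    }
}

# Writes {s3_prefix}/db_state.json, which triggers the pipeline ingest.
write_documents_to_s3(s3_client=s3_client, s3_prefix=s3_prefix, content=content)
```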