Feature/pdct 1440 implement multi geographies in the navigator backend (#386)

* Update geo_subquery to support multiple geographies

* Put code in fenced block

* Update DB client to 3.8.19

* Implement multiple geographies

* Add multiple geo family to setup_helpers

* Parametrize tests to test single and multi geo

* Update comment with ticket number

* Update docstrings with types

* Add ticket number to comment

* Bump to 1.18.1

* Remove comment

* Split pipeline query into smaller queries for debug

* Add missing param and return type in docstring

* Fix line length on docstring

* Update stop & rm

* Revert changes to pipeline query

* Make pipeline query support multi geos (#388)

* Make pipeline query support multi geos

* Make ordering of db objects the same

* Update comment

* Make sure query is more equivalent to incumbent

* Create db_state_validator.py

* Create way of validating two db state files

* Revert "Create db_state_validator.py"

This reverts commit a03f693.

* Get most recent slug comments

* Break formatting of db_state contents into separate function

* Refactor generating db_state content into new function

* Updating the script to use sdk models. (#389)

Co-authored-by: Mark <[email protected]>

* Add test cases for checking db_state content

* Refactor write_documents_to_s3 for test isolation

---------

Co-authored-by: NextGenEng <[email protected]>
Co-authored-by: Mark <[email protected]>

* Bump to 1.19.0

* Fix test

* Create pipeline.sql

* Remove whitespace

* Move pipeline service functions into separate file

* Rename flatten_pipeline_metadata to _flatten_pipeline_metadata

* Remove service function

* Move pipeline repo functions under repository

* Update CODEOWNERS

* Remove unused code

* Move pipeline.sql under sql folder

* Use raw SQL to execute pipeline query

* Add data_client as dependency to mock get_db

* Move pipeline tests under non_search

* Create test_flatten_pipeline_metadata.py

* Move file

* Driveby: Use os for path reference

* Add missing type hint

* Remove title casing

* Add return type hint

* Add missing param in docstring

* PR comment fixes

* Fix languages

* Use attribute accessor for series

* Move file

* Format family category as it's always CAPS in db

* Format query

* Make casing consistent

* Make published_date timezone aware

---------

Co-authored-by: NextGenEng <[email protected]>
Co-authored-by: Mark <[email protected]>
3 people authored Oct 22, 2024
1 parent a806001 commit 038163e
Showing 25 changed files with 1,127 additions and 370 deletions.
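The commit list above includes "Update geo_subquery to support multiple geographies", but the subquery itself is not visible in the expanded diff below. The following is only a minimal sketch, assuming hypothetical `Geography` and `FamilyGeography` ORM models and a PostgreSQL backend, of how a per-family geography aggregation could be built with SQLAlchemy; the model and column names are illustrative, not taken from the repository.

```python
# Minimal sketch (hypothetical models, not from this diff) of a geo subquery
# that returns every geography linked to a family rather than a single value.
from sqlalchemy import Column, ForeignKey, Integer, String, func, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Geography(Base):
    __tablename__ = "geography"
    id = Column(Integer, primary_key=True)
    value = Column(String)  # e.g. an ISO code such as "GBR"


class FamilyGeography(Base):
    __tablename__ = "family_geography"
    family_import_id = Column(String, primary_key=True)
    geography_id = Column(Integer, ForeignKey("geography.id"), primary_key=True)


def geo_subquery():
    """One row per family, with all linked geography values collected into an array."""
    return (
        select(
            FamilyGeography.family_import_id,
            func.array_agg(Geography.value).label("geographies"),
        )
        .join(Geography, Geography.id == FamilyGeography.geography_id)
        .group_by(FamilyGeography.family_import_id)
        .subquery()
    )
```

An outer query can then join this subquery on `family_import_id` to attach the list of geographies to each family row.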
4 changes: 3 additions & 1 deletion .github/CODEOWNERS
@@ -17,7 +17,9 @@
 app/api/api_v1/routers/pipeline_trigger.py @climatepolicyradar/deng
 tests/unit/app/core/test_pipeline.py @climatepolicyradar/deng
 tests/non_search/app/test_pipeline_trigger.py @climatepolicyradar/deng
-app/core/ingestion/ @climatepolicyradar/deng
+app/repository/pipeline.py @climatepolicyradar/deng
+app/service/pipeline.py @climatepolicyradar/deng
+app/service/pipeline.sql @climatepolicyradar/deng
 tests/search/search_fixtures/vespa_test_schema/ @climatepolicyradar/deng
 app/api/api_v1/routers/search.py @climatepolicyradar/tech-devs
 tests/search/ @climatepolicyradar/tech-devs
2 changes: 2 additions & 0 deletions .gitignore
@@ -196,3 +196,5 @@ requirements.txt
 
 # Vespa secrets directory
 secrets/*
+
+scripts/*.json
6 changes: 3 additions & 3 deletions app/api/api_v1/routers/pipeline_trigger.py
@@ -6,9 +6,9 @@
 from app.api.api_v1.schemas.document import BulkIngestResult
 from app.clients.aws.client import S3Client, get_s3_client
 from app.clients.db.session import get_db
-from app.core.ingestion.pipeline import generate_pipeline_ingest_input
 from app.core.validation.util import get_new_s3_prefix, write_documents_to_s3
 from app.service.auth import get_superuser_details
+from app.service.pipeline import get_db_state_content
 
 _LOGGER = logging.getLogger(__name__)

@@ -22,11 +22,11 @@ def _start_ingest(
 ):
     """Writes a db state file to s3 which will trigger an ingest."""
     try:
-        pipeline_ingest_input = generate_pipeline_ingest_input(db)
+        pipeline_ingest_input_content = get_db_state_content(db)
         write_documents_to_s3(
             s3_client=s3_client,
             s3_prefix=s3_prefix,
-            documents=pipeline_ingest_input,
+            content=pipeline_ingest_input_content,
         )
     except Exception as e:
         _LOGGER.exception(
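The diff above swaps `generate_pipeline_ingest_input` for `get_db_state_content`, which now hands the endpoint the full db-state document as a plain dict. The new app/service/pipeline.py is not shown in this view, so the following is only a rough sketch of the content-building step, under the assumption that the output keeps the previous `{"documents": {...}}` shape and that each repository row exposes an `import_id` and a `to_json()`-style serialisation; the names `PipelineRow` and `build_db_state_content` are hypothetical.

```python
from typing import Any, Protocol, Sequence


class PipelineRow(Protocol):
    """Shape each pipeline row is assumed to expose (hypothetical)."""

    import_id: str

    def to_json(self) -> dict[str, Any]: ...


def build_db_state_content(rows: Sequence[PipelineRow]) -> dict[str, Any]:
    """Assemble the db state document written to S3 to trigger a pipeline run.

    Sketch only: the real get_db_state_content presumably runs the pipeline
    query itself; here the rows are passed in to keep the example self-contained.
    """
    return {"documents": {row.import_id: row.to_json() for row in rows}}
```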
8 changes: 5 additions & 3 deletions app/api/api_v1/schemas/document.py
@@ -77,11 +77,12 @@ def _filter_climate_laws_url_from_source(cls, v):
 
 
 class FamilyContext(BaseModel):
-    """Used to given the family context when returning a FamilyDocument"""
+    """Used to give the family context when returning a FamilyDocument"""
 
     title: str
     import_id: str
-    geography: str
+    geography: str  # Keep this for backward compatibility PDCT-1440
+    geographies: list[str]
     category: str
     slug: str
     published_date: Optional[datetime] = None
@@ -103,7 +104,8 @@ class FamilyAndDocumentsResponse(BaseModel):
     import_id: str
     title: str
     summary: str
-    geography: str
+    geography: str  # Keep this for backward compatibility PDCT-1440
+    geographies: list[str]
     category: str
     status: str
     metadata: dict
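Both response models now carry `geographies: list[str]` alongside the legacy `geography: str` field kept for backward compatibility (PDCT-1440). A minimal sketch of that backward-compatibility pattern follows, assuming the singular field simply mirrors the first entry when several geographies exist; that convention is an assumption, not something this diff confirms.

```python
from pydantic import BaseModel


class FamilyContextSketch(BaseModel):
    """Cut-down stand-in for FamilyContext, showing only the geography fields."""

    geography: str  # legacy single value, kept for backward compatibility
    geographies: list[str]


def family_context_from_geographies(geographies: list[str]) -> FamilyContextSketch:
    # Assumption: the legacy field mirrors the first geography when several exist.
    return FamilyContextSketch(
        geography=geographies[0] if geographies else "",
        geographies=geographies,
    )


print(family_context_from_geographies(["GBR", "FRA"]).geography)  # -> "GBR"
```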
3 changes: 2 additions & 1 deletion app/core/download.py
@@ -1,5 +1,6 @@
 """Functions to support browsing the RDS document structure"""
 
+import os
 import zipfile
 from functools import lru_cache
 from io import BytesIO, StringIO
@@ -16,7 +17,7 @@
 
 @lru_cache()
 def _get_query_template():
-    with open("./app/core/download.sql", "r") as file:
+    with open(os.path.join("app", "core", "download.sql"), "r") as file:
         return file.read()


86 changes: 0 additions & 86 deletions app/core/ingestion/match.py

This file was deleted.

143 changes: 0 additions & 143 deletions app/core/ingestion/pipeline.py

This file was deleted.
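app/core/ingestion/pipeline.py is deleted; per the commit list, the query it built now lives as raw SQL in pipeline.sql (later moved under an sql folder) and is executed by a repository function. Below is a minimal sketch of that pattern, reusing the lru_cache/os.path template-loading approach seen in app/core/download.py above; the file path and function names are assumptions, not the repository's actual ones.

```python
import os
from functools import lru_cache

from sqlalchemy import text
from sqlalchemy.orm import Session


@lru_cache()
def _get_pipeline_query():
    # Assumed location; the commits move pipeline.sql around, so the final path may differ.
    with open(os.path.join("app", "repository", "sql", "pipeline.sql"), "r") as file:
        return file.read()


def run_pipeline_query(db: Session) -> list[dict]:
    """Execute the raw pipeline SQL and return rows as plain dicts (sketch)."""
    result = db.execute(text(_get_pipeline_query()))
    return [dict(row) for row in result.mappings()]
```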

11 changes: 3 additions & 8 deletions app/core/validation/util.py
@@ -6,7 +6,6 @@
 from io import BytesIO
 from typing import Any, Collection, Mapping, Optional, Sequence, Union
 
-from app.api.api_v1.schemas.document import DocumentParserInput
 from app.clients.aws.client import S3Client
 from app.clients.aws.s3_document import S3Document
 from app.config import INGEST_TRIGGER_ROOT, PIPELINE_BUCKET
@@ -40,22 +39,18 @@ def get_value(data_node: Mapping[str, str]) -> str:
 
 
 def write_documents_to_s3(
-    s3_client: S3Client,
-    s3_prefix: str,
-    documents: Sequence[DocumentParserInput],
+    s3_client: S3Client, s3_prefix: str, content: dict[str, Any]
 ) -> Union[S3Document, bool]:
     """
     Write current state of documents into S3 to trigger a pipeline run after bulk ingest
 
     :param [S3Client] s3_client: an S3 client to use to write data
     :param [str] s3_prefix: prefix into which to write the document updates in s3
+    :param [dict[str, Any]] content: db state document content
     :param [Sequence[DocumentCreateRequest]] documents: a sequence of document
         specifications to write to S3
     """
-    json_content = json.dumps(
-        {"documents": {d.import_id: d.to_json() for d in documents}},
-        indent=2,
-    )
+    json_content = json.dumps(content, indent=2)
     bytes_content = BytesIO(json_content.encode("utf8"))
     documents_object_key = f"{s3_prefix}/db_state.json"
     _LOGGER.info("Writing Documents file into S3")
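With the new signature, write_documents_to_s3 no longer serialises a sequence of DocumentParserInput objects itself; callers pass the already-built content dict. A minimal usage sketch follows, assuming get_s3_client() and get_new_s3_prefix() can be called directly outside FastAPI dependency injection; the sample content is illustrative only.

```python
# Usage sketch for the new write_documents_to_s3 signature (sample data is made up).
from app.clients.aws.client import get_s3_client
from app.core.validation.util import get_new_s3_prefix, write_documents_to_s3

s3_client = get_s3_client()
s3_prefix = get_new_s3_prefix()

content = {
    "documents": {
        "CCLW.executive.1.1": {
            "import_id": "CCLW.executive.1.1",
            "geographies": ["GBR", "FRA"],  # multi-geography support (PDCT-1440)
        }
    }
}

# Writes {s3_prefix}/db_state.json, which triggers the pipeline ingest.
write_documents_to_s3(s3_client=s3_client, s3_prefix=s3_prefix, content=content)
```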