diff --git a/app/api/api_v1/routers/__init__.py b/app/api/api_v1/routers/__init__.py
index 58e4585e..94c9f748 100644
--- a/app/api/api_v1/routers/__init__.py
+++ b/app/api/api_v1/routers/__init__.py
@@ -1,12 +1,12 @@
 from app.api.api_v1.routers.analytics import analytics_router
 from app.api.api_v1.routers.auth import auth_router
+from app.api.api_v1.routers.bulk_import import bulk_import_router
 from app.api.api_v1.routers.collection import collections_router
 from app.api.api_v1.routers.config import config_router
 from app.api.api_v1.routers.corpus import corpora_router
 from app.api.api_v1.routers.document import document_router
 from app.api.api_v1.routers.event import event_router
 from app.api.api_v1.routers.family import families_router
-from app.api.api_v1.routers.ingest import ingest_router

 __all__ = (
     "analytics_router",
@@ -17,5 +17,5 @@
     "document_router",
     "event_router",
     "families_router",
-    "ingest_router",
+    "bulk_import_router",
 )
diff --git a/app/api/api_v1/routers/ingest.py b/app/api/api_v1/routers/bulk_import.py
similarity index 75%
rename from app/api/api_v1/routers/ingest.py
rename to app/api/api_v1/routers/bulk_import.py
index b39829ae..e8f0b439 100644
--- a/app/api/api_v1/routers/ingest.py
+++ b/app/api/api_v1/routers/bulk_import.py
@@ -12,31 +12,31 @@
 from app.errors import ValidationError
 from app.model.general import Json
-from app.service.ingest import (
+from app.service.bulk_import import (
     get_collection_template,
     get_document_template,
     get_event_template,
     get_family_template,
     import_data,
 )
-from app.service.validation import validate_ingest_data
+from app.service.validation import validate_bulk_import_data

-ingest_router = r = APIRouter()
+bulk_import_router = r = APIRouter()

 _LOGGER = logging.getLogger(__name__)


 @r.get(
-    "/ingest/template/{corpus_type}",
+    "/bulk-import/template/{corpus_type}",
     response_model=Json,
     status_code=status.HTTP_200_OK,
 )
-async def get_ingest_template(corpus_type: str) -> Json:
+async def get_bulk_import_template(corpus_type: str) -> Json:
     """
-    Data ingest template endpoint.
+    Bulk import template endpoint.

-    :param str corpus_type: type of the corpus of data to ingest.
-    :return Json: json representation of ingest template.
+    :param str corpus_type: type of the corpus of data to import.
+    :return Json: json representation of bulk import template.
     """
     _LOGGER.info(f"Creating template for corpus type: {corpus_type}")

@@ -54,30 +54,30 @@ async def get_ingest_template(corpus_type: str) -> Json:


 @r.post(
-    "/ingest/{corpus_import_id}",
+    "/bulk-import/{corpus_import_id}",
     response_model=Json,
     status_code=status.HTTP_202_ACCEPTED,
 )
-async def ingest(
+async def bulk_import(
     request: Request,
-    new_data: UploadFile,
+    data: UploadFile,
     corpus_import_id: str,
     background_tasks: BackgroundTasks,
 ) -> Json:
     """
     Bulk import endpoint.

-    :param UploadFile new_data: file containing json representation of data to ingest.
-    :return Json: json representation of the data to ingest.
+    :param UploadFile data: file containing json representation of data to import.
+    :return Json: json representation of the data to import.
""" _LOGGER.info( f"User {request.state.user} triggered bulk import for corpus: {corpus_import_id}" ) try: - content = await new_data.read() + content = await data.read() data_dict = json.loads(content) - validate_ingest_data(data_dict) + validate_bulk_import_data(data_dict) background_tasks.add_task(import_data, data_dict, corpus_import_id) diff --git a/app/clients/aws/s3bucket.py b/app/clients/aws/s3bucket.py index 65b49568..13310024 100644 --- a/app/clients/aws/s3bucket.py +++ b/app/clients/aws/s3bucket.py @@ -106,25 +106,25 @@ def upload_json_to_s3( raise -def upload_ingest_json_to_s3( - ingest_id: str, corpus_import_id: str, data: dict[str, Any] +def upload_bulk_import_json_to_s3( + import_id: str, corpus_import_id: str, data: dict[str, Any] ) -> None: """ - Upload an ingest JSON file to S3 + Upload an bulk import JSON file to S3 - :param str ingest_id: The uuid of the ingest action. - :param str corpus_import_id: The import_id of the corpus the ingest data belongs to. - :param dict[str, Any] json_data: The ingest json data to be uploaded to S3. + :param str import_id: The uuid of the bulk import action. + :param str corpus_import_id: The id of the corpus the bulk import data belongs to. + :param dict[str, Any] json_data: The bulk import json data to be uploaded to S3. """ - ingest_upload_bucket = os.environ["BULK_IMPORT_BUCKET"] + bulk_import_upload_bucket = os.environ["BULK_IMPORT_BUCKET"] current_timestamp = datetime.now().strftime("%m-%d-%YT%H:%M:%S") - filename = f"{ingest_id}-{corpus_import_id}-{current_timestamp}.json" + filename = f"{import_id}-{corpus_import_id}-{current_timestamp}.json" s3_client = boto3.client("s3") context = S3UploadContext( - bucket_name=ingest_upload_bucket, + bucket_name=bulk_import_upload_bucket, object_name=filename, ) upload_json_to_s3(s3_client, context, data) diff --git a/app/main.py b/app/main.py index d6b3dfe8..46bd5f1d 100644 --- a/app/main.py +++ b/app/main.py @@ -17,13 +17,13 @@ from app.api.api_v1.routers import ( analytics_router, auth_router, + bulk_import_router, collections_router, config_router, corpora_router, document_router, event_router, families_router, - ingest_router, ) from app.api.api_v1.routers.auth import check_user_auth from app.clients.db.session import engine @@ -97,9 +97,9 @@ async def lifespan(app_: FastAPI): ) app.include_router( - ingest_router, + bulk_import_router, prefix="/api/v1", - tags=["ingest"], + tags=["bulk-import"], dependencies=[Depends(check_user_auth)], ) diff --git a/app/model/authorisation.py b/app/model/authorisation.py index 0db7a1d4..5d542694 100644 --- a/app/model/authorisation.py +++ b/app/model/authorisation.py @@ -18,7 +18,7 @@ class AuthEndpoint(str, enum.Enum): CONFIG = "CONFIG" ANALYTICS = "ANALYTICS" EVENT = "EVENTS" - INGEST = "INGEST" + BULK_IMPORT = "BULK-IMPORT" CORPUS = "CORPORA" @@ -61,8 +61,8 @@ class AuthEndpoint(str, enum.Enum): AuthOperation.UPDATE: AuthAccess.USER, AuthOperation.DELETE: AuthAccess.USER, }, - # Ingest - AuthEndpoint.INGEST: { + # Bulk Import + AuthEndpoint.BULK_IMPORT: { AuthOperation.CREATE: AuthAccess.SUPER, AuthOperation.READ: AuthAccess.SUPER, }, diff --git a/app/model/ingest.py b/app/model/bulk_import.py similarity index 72% rename from app/model/ingest.py rename to app/model/bulk_import.py index 830871ae..f1bce8c0 100644 --- a/app/model/ingest.py +++ b/app/model/bulk_import.py @@ -1,17 +1,18 @@ from datetime import datetime -from typing import Optional +from typing import Dict, List, Optional, Union -from pydantic import AnyHttpUrl, BaseModel +from pydantic 
import AnyHttpUrl, BaseModel, RootModel from app.model.collection import CollectionCreateDTO from app.model.document import DocumentCreateDTO from app.model.event import EventCreateDTO from app.model.family import FamilyCreateDTO -from app.model.general import Json +Metadata = RootModel[Dict[str, Union[str, List[str]]]] -class IngestCollectionDTO(BaseModel): - """Representation of a collection for ingest.""" + +class BulkImportCollectionDTO(BaseModel): + """Representation of a collection for bulk import.""" import_id: str title: str @@ -19,7 +20,7 @@ class IngestCollectionDTO(BaseModel): def to_collection_create_dto(self) -> CollectionCreateDTO: """ - Convert IngestCollectionDTO to CollectionCreateDTO. + Convert BulkImportCollectionDTO to CollectionCreateDTO. :return CollectionCreateDTO: Converted CollectionCreateDTO instance. """ @@ -30,21 +31,21 @@ def to_collection_create_dto(self) -> CollectionCreateDTO: ) -class IngestFamilyDTO(BaseModel): - """Representation of a family for ingest.""" +class BulkImportFamilyDTO(BaseModel): + """Representation of a family for bulk import.""" import_id: str title: str summary: str geographies: list[str] category: str - metadata: Json + metadata: Metadata collections: list[str] corpus_import_id: str def to_family_create_dto(self, corpus_import_id: str) -> FamilyCreateDTO: """ - Convert IngestFamilyDTO to FamilyCreateDTO. + Convert BulkImportFamilyDTO to FamilyCreateDTO. :return FamilyCreateDTO: Converted FamilyCreateDTO instance. """ @@ -54,14 +55,14 @@ def to_family_create_dto(self, corpus_import_id: str) -> FamilyCreateDTO: summary=self.summary, geography=self.geographies, category=self.category, - metadata=self.metadata, + metadata=self.metadata.model_dump(), collections=self.collections, corpus_import_id=corpus_import_id, ) -class IngestEventDTO(BaseModel): - """Representation of an event for ingest.""" +class BulkImportEventDTO(BaseModel): + """Representation of an event for bulk import.""" import_id: str family_import_id: str @@ -72,7 +73,7 @@ class IngestEventDTO(BaseModel): def to_event_create_dto(self) -> EventCreateDTO: """ - Convert IngestEventDTO to EventCreateDTO. + Convert BulkImportEventDTO to EventCreateDTO. :return EventCreateDTO: Converted EventCreateDTO instance. """ @@ -85,20 +86,20 @@ def to_event_create_dto(self) -> EventCreateDTO: ) -class IngestDocumentDTO(BaseModel): - """Representation of a document for ingest.""" +class BulkImportDocumentDTO(BaseModel): + """Representation of a document for bulk import.""" import_id: str family_import_id: str variant_name: Optional[str] = None - metadata: Json + metadata: Metadata title: str source_url: Optional[AnyHttpUrl] = None user_language_name: Optional[str] = None def to_document_create_dto(self) -> DocumentCreateDTO: """ - Convert IngestDocumentDTO to DocumentCreateDTO. + Convert BulkImportDocumentDTO to DocumentCreateDTO. :return DocumentCreateDTO: Converted DocumentCreateDTO instance. 
""" @@ -107,7 +108,7 @@ def to_document_create_dto(self) -> DocumentCreateDTO: import_id=self.import_id, family_import_id=self.family_import_id, variant_name=self.variant_name, - metadata=self.metadata, + metadata=self.metadata.model_dump(), title=self.title, source_url=self.source_url, user_language_name=self.user_language_name, diff --git a/app/repository/family.py b/app/repository/family.py index ffe42476..cb36c279 100644 --- a/app/repository/family.py +++ b/app/repository/family.py @@ -22,7 +22,7 @@ from db_client.models.organisation.users import Organisation from sqlalchemy import Column, and_ from sqlalchemy import delete as db_delete -from sqlalchemy import desc, func, or_ +from sqlalchemy import desc, or_ from sqlalchemy import update as db_update from sqlalchemy.exc import NoResultFound, OperationalError from sqlalchemy.orm import Query, Session @@ -34,30 +34,24 @@ _LOGGER = logging.getLogger(__name__) -FamilyGeoMetaOrg = Tuple[Family, str, FamilyMetadata, Corpus, Organisation] +FamilyGeoMetaOrg = Tuple[Family, Geography, FamilyMetadata, Corpus, Organisation] def _get_query(db: Session) -> Query: # NOTE: SqlAlchemy will make a complete hash of query generation # if columns are used in the query() call. Therefore, entire # objects are returned. - geo_subquery = ( - db.query( - func.min(Geography.value).label("value"), - FamilyGeography.family_import_id, - ) - .join(FamilyGeography, FamilyGeography.geography_id == Geography.id) - .filter(FamilyGeography.family_import_id == Family.import_id) - .group_by(Geography.value, FamilyGeography.family_import_id) - ).subquery("geo_subquery") - return ( - db.query(Family, geo_subquery.c.value, FamilyMetadata, Corpus, Organisation) # type: ignore + db.query(Family, Geography, FamilyMetadata, Corpus, Organisation) + .join(FamilyGeography, FamilyGeography.family_import_id == Family.import_id) + .join( + Geography, + Geography.id == FamilyGeography.geography_id, + ) .join(FamilyMetadata, FamilyMetadata.family_import_id == Family.import_id) .join(FamilyCorpus, FamilyCorpus.family_import_id == Family.import_id) .join(Corpus, Corpus.import_id == FamilyCorpus.corpus_import_id) .join(Organisation, Corpus.organisation_id == Organisation.id) - .filter(geo_subquery.c.family_import_id == Family.import_id) # type: ignore ) @@ -65,14 +59,14 @@ def _family_to_dto( db: Session, fam_geo_meta_corp_org: FamilyGeoMetaOrg ) -> FamilyReadDTO: fam, geo_value, meta, corpus, org = fam_geo_meta_corp_org - metadata = cast(dict, meta.value) org = cast(str, org.name) + return FamilyReadDTO( import_id=str(fam.import_id), title=str(fam.title), summary=str(fam.description), - geography=geo_value, + geography=str(geo_value.value), category=str(fam.family_category), status=str(fam.family_status), metadata=metadata, diff --git a/app/service/ingest.py b/app/service/bulk_import.py similarity index 91% rename from app/service/ingest.py rename to app/service/bulk_import.py index 84288ca2..9b905f9d 100644 --- a/app/service/ingest.py +++ b/app/service/bulk_import.py @@ -1,5 +1,5 @@ """ -Ingest Service +Bulk Import Service This layer uses the corpus, collection, family, document and event repos to handle bulk import of data and other services for validation etc. 
@@ -27,13 +27,13 @@ import app.service.notification as notification_service import app.service.taxonomy as taxonomy import app.service.validation as validation -from app.clients.aws.s3bucket import upload_ingest_json_to_s3 +from app.clients.aws.s3bucket import upload_bulk_import_json_to_s3 from app.errors import ValidationError -from app.model.ingest import ( - IngestCollectionDTO, - IngestDocumentDTO, - IngestEventDTO, - IngestFamilyDTO, +from app.model.bulk_import import ( + BulkImportCollectionDTO, + BulkImportDocumentDTO, + BulkImportEventDTO, + BulkImportFamilyDTO, ) from app.repository.helpers import generate_slug from app.service.event import ( @@ -41,7 +41,7 @@ get_datetime_event_name_for_corpus, ) -DOCUMENT_INGEST_LIMIT = 1000 +DOCUMENT_BULK_IMPORT_LIMIT = 1000 _LOGGER = logging.getLogger(__name__) _LOGGER.setLevel(logging.DEBUG) @@ -72,7 +72,7 @@ def get_collection_template() -> dict: :return dict: The collection template. """ - collection_schema = IngestCollectionDTO.model_json_schema(mode="serialization") + collection_schema = BulkImportCollectionDTO.model_json_schema(mode="serialization") collection_template = collection_schema["properties"] return collection_template @@ -84,7 +84,7 @@ def get_event_template(corpus_type: str) -> dict: :return dict: The event template. """ - event_schema = IngestEventDTO.model_json_schema(mode="serialization") + event_schema = BulkImportEventDTO.model_json_schema(mode="serialization") event_template = event_schema["properties"] event_meta = get_metadata_template(corpus_type, CountedEntity.Event) @@ -104,7 +104,7 @@ def get_document_template(corpus_type: str) -> dict: :param str corpus_type: The corpus_type to use to get the document template. :return dict: The document template. """ - document_schema = IngestDocumentDTO.model_json_schema(mode="serialization") + document_schema = BulkImportDocumentDTO.model_json_schema(mode="serialization") document_template = document_schema["properties"] document_template["metadata"] = get_metadata_template( corpus_type, CountedEntity.Document @@ -142,7 +142,7 @@ def get_family_template(corpus_type: str) -> dict: :param str corpus_type: The corpus_type to use to get the family template. :return dict: The family template. 
""" - family_schema = IngestFamilyDTO.model_json_schema(mode="serialization") + family_schema = BulkImportFamilyDTO.model_json_schema(mode="serialization") family_template = family_schema["properties"] del family_template["corpus_import_id"] @@ -179,7 +179,7 @@ def save_collections( for coll in collection_data: if not _exists_in_db(Collection, coll["import_id"], db): _LOGGER.info(f"Importing collection {coll['import_id']}") - dto = IngestCollectionDTO(**coll).to_collection_create_dto() + dto = BulkImportCollectionDTO(**coll).to_collection_create_dto() import_id = collection_repository.create(db, dto, org_id) collection_import_ids.append(import_id) total_collections_saved += 1 @@ -215,7 +215,7 @@ def save_families( for fam in family_data: if not _exists_in_db(Family, fam["import_id"], db): _LOGGER.info(f"Importing family {fam['import_id']}") - dto = IngestFamilyDTO( + dto = BulkImportFamilyDTO( **fam, corpus_import_id=corpus_import_id ).to_family_create_dto(corpus_import_id) geo_ids = [] @@ -256,10 +256,10 @@ def save_documents( for doc in document_data: if ( not _exists_in_db(FamilyDocument, doc["import_id"], db) - and total_documents_saved < DOCUMENT_INGEST_LIMIT + and total_documents_saved < DOCUMENT_BULK_IMPORT_LIMIT ): _LOGGER.info(f"Importing document {doc['import_id']}") - dto = IngestDocumentDTO(**doc).to_document_create_dto() + dto = BulkImportDocumentDTO(**doc).to_document_create_dto() slug = generate_slug(db=db, title=dto.title, created_slugs=document_slugs) import_id = document_repository.create(db, dto, slug) document_slugs.add(slug) @@ -296,7 +296,7 @@ def save_events( for event in event_data: if not _exists_in_db(FamilyEvent, event["import_id"], db): _LOGGER.info(f"Importing event {event['import_id']}") - dto = IngestEventDTO(**event).to_event_create_dto() + dto = BulkImportEventDTO(**event).to_event_create_dto() event_metadata = create_event_metadata_object( db, corpus_import_id, event["event_type_value"], datetime_event_name ) @@ -323,8 +323,8 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None: ) end_message = "" - ingest_uuid = uuid4() - upload_ingest_json_to_s3(f"{ingest_uuid}-request", corpus_import_id, data) + import_uuid = uuid4() + upload_bulk_import_json_to_s3(f"{import_uuid}-request", corpus_import_id, data) _LOGGER.info("Getting DB session") @@ -353,7 +353,7 @@ def import_data(data: dict[str, Any], corpus_import_id: str) -> None: _LOGGER.info("Saving events") result["events"] = save_events(event_data, corpus_import_id, db) - upload_ingest_json_to_s3(f"{ingest_uuid}-result", corpus_import_id, result) + upload_bulk_import_json_to_s3(f"{import_uuid}-result", corpus_import_id, result) end_message = ( f"🎉 Bulk import for corpus: {corpus_import_id} successfully completed." 
diff --git a/app/service/validation.py b/app/service/validation.py index 88e67668..439020fb 100644 --- a/app/service/validation.py +++ b/app/service/validation.py @@ -14,8 +14,8 @@ from app.service.event import create_event_metadata_object -class IngestEntityList(str, Enum): - """Name of the list of entities that can be ingested.""" +class BulkImportEntityList(str, Enum): + """Name of the list of entities that can be imported.""" Collections = "collections" Families = "families" @@ -143,14 +143,14 @@ def validate_events(events: list[dict[str, Any]], corpus_import_id: str) -> None def _collect_import_ids( - entity_list_name: IngestEntityList, + entity_list_name: BulkImportEntityList, data: dict[str, Any], import_id_type_name: Optional[str] = None, ) -> list[str]: """ Extracts a list of import_ids (or family_import_ids if specified) for the specified entity list in data. - :param IngestEntityList entity_list_name: The name of the entity list from which the import_ids are to be extracted. + :param BulkImportEntityList entity_list_name: The name of the entity list from which the import_ids are to be extracted. :param dict[str, Any] data: The data structure containing the entity lists used for extraction. :param Optional[str] import_id_type_name: the name of the type of import_id to be extracted or None. :return list[str]: A list of extracted import_ids for the specified entity list. @@ -184,7 +184,7 @@ def _validate_collections_exist_for_families(data: dict[str, Any]) -> None: :param dict[str, Any] data: The data object containing entities to be validated. """ - collections = _collect_import_ids(IngestEntityList.Collections, data) + collections = _collect_import_ids(BulkImportEntityList.Collections, data) collections_set = set(collections) family_collection_import_ids = [] @@ -202,14 +202,14 @@ def _validate_families_exist_for_events_and_documents(data: dict[str, Any]) -> N :param dict[str, Any] data: The data object containing entities to be validated. """ - families = _collect_import_ids(IngestEntityList.Families, data) + families = _collect_import_ids(BulkImportEntityList.Families, data) families_set = set(families) document_family_import_ids = _collect_import_ids( - IngestEntityList.Documents, data, "family_import_id" + BulkImportEntityList.Documents, data, "family_import_id" ) event_family_import_ids = _collect_import_ids( - IngestEntityList.Events, data, "family_import_id" + BulkImportEntityList.Events, data, "family_import_id" ) _match_import_ids(document_family_import_ids, families_set) @@ -228,9 +228,9 @@ def validate_entity_relationships(data: dict[str, Any]) -> None: _validate_families_exist_for_events_and_documents(data) -def validate_ingest_data(data: dict[str, Any]) -> None: +def validate_bulk_import_data(data: dict[str, Any]) -> None: """ - Validates data to be ingested. + Validates data to be bulk imported. :param dict[str, Any] data: The data object to be validated. :raises HTTPException: raised if data is empty or None. 
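Note (outside the diff): with the router, validation and authorisation changes above, a client call to the renamed endpoint looks roughly as follows; the multipart field is now data rather than new_data, and only superusers are authorised. The base URL, token and payload file below are placeholders, and the sketch assumes the requests library:

import requests

API = "http://localhost:8888/api/v1"  # placeholder base URL
CORPUS_IMPORT_ID = "UNFCCC.corpus.i00000001.n0000"
TOKEN = "<superuser-jwt>"  # placeholder credential

# bulk_import.json is a placeholder file shaped like the bulk import template
# returned by GET /bulk-import/template/{corpus_type}.
with open("bulk_import.json", "rb") as f:
    response = requests.post(
        f"{API}/bulk-import/{CORPUS_IMPORT_ID}",
        files={"data": f},  # the upload field was renamed from "new_data"
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30,
    )

# 202 means the payload passed validation and import_data now runs as a
# FastAPI background task, writing request/result JSON to BULK_IMPORT_BUCKET.
print(response.status_code, response.json())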
diff --git a/docker-compose-fulldata.yml b/docker-compose-fulldata.yml
new file mode 100644
index 00000000..e69de29b
diff --git a/docker-compose.yml b/docker-compose.yml
index f61d953a..bcdee019 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -47,5 +47,56 @@ services:
       localstack:
         condition: service_started

+  pgadmin:
+    image: dpage/pgadmin4
+    container_name: pgadmin4
+    restart: always
+    env_file:
+      - .env
+    ports:
+      - 8000:80
+    # This has to have world & group access denied to work
+    entrypoint: /bin/sh -c "chmod 600 /pgpass; /entrypoint.sh;"
+    configs:
+      - source: servers.json
+        target: /pgadmin4/servers.json
+      - source: pgpass
+        target: /pgpass
+    environment:
+      # This deactivates the login screen
+      PGADMIN_CONFIG_SERVER_MODE: "False"
+      PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: "False"
+    volumes:
+      - pgadmin-data:/var/lib/pgadmin
+    # This is just nice to know, as if it fails, it fails quietly
+    healthcheck:
+      # there are conflicting trunk errors here. Either
+      # - having `:` in the URL (fails in GitHub Actions)
+      # - redundantly quoting the URL (fails locally)
+      # trunk-ignore(yamllint/quoted-strings)
+      test: [CMD, wget, -O, "-", "http://localhost:80/misc/ping"]
+configs:
+  pgpass:
+    content: "admin_backend_db:5432:*:${POSTGRES_USER}:${POSTGRES_PASSWORD}"
+  servers.json:
+    content: |
+      {
+        "Servers": {
+          "1": {
+            "Group": "Servers",
+            "Name": "Navigator Admin Backend",
+            "Host": "admin_backend_db",
+            "Port": 5432,
+            "MaintenanceDB": "postgres",
+            "Username": "${POSTGRES_USER}",
+            "Password": "${POSTGRES_PASSWORD}",
+            "SSLMode": "prefer"
+          }
+        }
+      }
+  localstack:
+    condition: service_started
+
 volumes:
   admin-data: {}
+  pgadmin-data: {}
diff --git a/pyproject.toml b/pyproject.toml
index 3290b087..a0463a9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "admin_backend"
-version = "2.17.17"
+version = "2.17.20"
 description = ""
 authors = ["CPR-dev-team "]
 packages = [{ include = "app" }, { include = "tests" }]
diff --git a/tests/helpers/ingest.py b/tests/helpers/bulk_import.py
similarity index 100%
rename from tests/helpers/ingest.py
rename to tests/helpers/bulk_import.py
diff --git a/tests/integration_tests/ingest/test_ingest.py b/tests/integration_tests/bulk_import/test_bulk_import.py
similarity index 86%
rename from tests/integration_tests/ingest/test_ingest.py
rename to tests/integration_tests/bulk_import/test_bulk_import.py
index b5db7a75..afa185b1 100644
--- a/tests/integration_tests/ingest/test_ingest.py
+++ b/tests/integration_tests/bulk_import/test_bulk_import.py
@@ -9,7 +9,7 @@
 from sqlalchemy import update
 from sqlalchemy.orm import Session

-from tests.helpers.ingest import (
+from tests.helpers.bulk_import import (
     build_json_file,
     default_collection,
     default_document,
@@ -54,12 +54,14 @@ def create_input_json_with_two_of_each_entity():


 @pytest.mark.s3
-def test_ingest_when_ok(data_db: Session, client: TestClient, superuser_header_token):
+def test_bulk_import_when_ok(
+    data_db: Session, client: TestClient, superuser_header_token
+):
     input_json = create_input_json_with_two_of_each_entity()

     response = client.post(
-        "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
-        files={"new_data": input_json},
+        "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000",
+        files={"data": input_json},
         headers=superuser_header_token,
     )

@@ -128,8 +130,8 @@ def test_import_data_rollback(

     with caplog.at_level(logging.ERROR):
         response = client.post(
-            "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000",
-            files={"new_data": input_json},
+
"/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", + files={"data": input_json}, headers=superuser_header_token, ) @@ -148,7 +150,7 @@ def test_import_data_rollback( @pytest.mark.s3 -def test_ingest_idempotency( +def test_bulk_import_idempotency( caplog, data_db: Session, client: TestClient, @@ -168,8 +170,8 @@ def test_ingest_idempotency( with caplog.at_level(logging.ERROR): first_response = client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", - files={"new_data": input_json}, + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", + files={"data": input_json}, headers=superuser_header_token, ) @@ -201,8 +203,8 @@ def test_ingest_idempotency( with caplog.at_level(logging.ERROR): second_response = client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", - files={"new_data": input_json}, + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", + files={"data": input_json}, headers=superuser_header_token, ) @@ -255,8 +257,8 @@ def test_generates_unique_slugs_for_documents_with_identical_titles( with caplog.at_level(logging.ERROR): response = client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", - files={"new_data": input_json}, + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", + files={"data": input_json}, headers=superuser_header_token, ) @@ -275,7 +277,7 @@ def test_generates_unique_slugs_for_documents_with_identical_titles( @pytest.mark.s3 -def test_ingest_when_corpus_import_id_invalid( +def test_bulk_import_when_corpus_import_id_invalid( caplog, data_db: Session, client: TestClient, @@ -286,8 +288,8 @@ def test_ingest_when_corpus_import_id_invalid( with caplog.at_level(logging.ERROR): response = client.post( - f"/api/v1/ingest/{invalid_corpus}", - files={"new_data": input_json}, + f"/api/v1/bulk-import/{invalid_corpus}", + files={"data": input_json}, headers=superuser_header_token, ) @@ -300,7 +302,7 @@ def test_ingest_when_corpus_import_id_invalid( @pytest.mark.s3 -def test_ingest_events_when_event_type_invalid( +def test_bulk_import_events_when_event_type_invalid( caplog, data_db: Session, client: TestClient, @@ -317,8 +319,8 @@ def test_ingest_events_when_event_type_invalid( with caplog.at_level(logging.ERROR): response = client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", - files={"new_data": input_json}, + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", + files={"data": input_json}, headers=superuser_header_token, ) @@ -333,32 +335,36 @@ def test_ingest_events_when_event_type_invalid( ) -def test_ingest_when_not_authorised(client: TestClient, data_db: Session): +def test_bulk_import_when_not_authorised(client: TestClient, data_db: Session): response = client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", ) assert response.status_code == status.HTTP_401_UNAUTHORIZED -def test_ingest_admin_non_super( +def test_bulk_import_admin_non_super( client: TestClient, data_db: Session, admin_user_header_token ): response = client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", headers=admin_user_header_token, ) assert response.status_code == status.HTTP_403_FORBIDDEN data = response.json() - assert data["detail"] == "User admin@cpr.org is not authorised to CREATE an INGEST" + assert ( + data["detail"] == "User admin@cpr.org is not authorised to CREATE a BULK-IMPORT" + ) -def test_ingest_non_super_non_admin( +def test_bulk_import_non_super_non_admin( client: TestClient, data_db: Session, user_header_token ): response = 
client.post( - "/api/v1/ingest/UNFCCC.corpus.i00000001.n0000", + "/api/v1/bulk-import/UNFCCC.corpus.i00000001.n0000", headers=user_header_token, ) assert response.status_code == status.HTTP_403_FORBIDDEN data = response.json() - assert data["detail"] == "User cclw@cpr.org is not authorised to CREATE an INGEST" + assert ( + data["detail"] == "User cclw@cpr.org is not authorised to CREATE a BULK-IMPORT" + ) diff --git a/tests/integration_tests/ingest/test_ingest_template.py b/tests/integration_tests/bulk_import/test_bulk_import_template.py similarity index 96% rename from tests/integration_tests/ingest/test_ingest_template.py rename to tests/integration_tests/bulk_import/test_bulk_import_template.py index 8fc6f0b4..f31d5092 100644 --- a/tests/integration_tests/ingest/test_ingest_template.py +++ b/tests/integration_tests/bulk_import/test_bulk_import_template.py @@ -7,7 +7,7 @@ def test_get_template_unfccc( data_db: Session, client: TestClient, superuser_header_token ): response = client.get( - "/api/v1/ingest/template/Intl. agreements", headers=superuser_header_token + "/api/v1/bulk-import/template/Intl. agreements", headers=superuser_header_token ) assert response.status_code == status.HTTP_200_OK @@ -222,7 +222,7 @@ def test_get_template_unfccc( def test_get_template_when_not_authorised(client: TestClient, data_db: Session): response = client.get( - "/api/v1/ingest/template/Intl. agreements", + "/api/v1/bulk-import/template/Intl. agreements", ) assert response.status_code == status.HTTP_401_UNAUTHORIZED @@ -231,21 +231,23 @@ def test_get_template_admin_non_super( data_db: Session, client: TestClient, admin_user_header_token ): response = client.get( - "/api/v1/ingest/template/Intl. agreements", + "/api/v1/bulk-import/template/Intl. agreements", headers=admin_user_header_token, ) assert response.status_code == status.HTTP_403_FORBIDDEN data = response.json() - assert data["detail"] == "User admin@cpr.org is not authorised to READ an INGEST" + assert ( + data["detail"] == "User admin@cpr.org is not authorised to READ a BULK-IMPORT" + ) def test_get_template_non_admin_non_super( data_db: Session, client: TestClient, user_header_token ): response = client.get( - "/api/v1/ingest/template/Intl. agreements", + "/api/v1/bulk-import/template/Intl. 
agreements", headers=user_header_token, ) assert response.status_code == status.HTTP_403_FORBIDDEN data = response.json() - assert data["detail"] == "User cclw@cpr.org is not authorised to READ an INGEST" + assert data["detail"] == "User cclw@cpr.org is not authorised to READ a BULK-IMPORT" diff --git a/tests/integration_tests/family/test_search.py b/tests/integration_tests/family/test_search.py index c3e7d361..0eff7d47 100644 --- a/tests/integration_tests/family/test_search.py +++ b/tests/integration_tests/family/test_search.py @@ -7,6 +7,27 @@ from tests.integration_tests.setup_db import setup_db +def test_search_geographies( + client: TestClient, data_db: Session, superuser_header_token +): + setup_db(data_db) + + tests_cases = [ + ("afghanistan", 2), + ("zimbabwe", 1), + ] + + for country, expected_count in tests_cases: + response = client.get( + f"/api/v1/families/?geography={country}", + headers=superuser_header_token, + ) + assert response.status_code == status.HTTP_200_OK + data = response.json() + assert isinstance(data, list) + assert len(data) == expected_count + + def test_search_family_super( client: TestClient, data_db: Session, superuser_header_token ): diff --git a/tests/integration_tests/family/test_update.py b/tests/integration_tests/family/test_update.py index 958873e2..495a0889 100644 --- a/tests/integration_tests/family/test_update.py +++ b/tests/integration_tests/family/test_update.py @@ -232,7 +232,7 @@ def test_update_family_collections_to_one_that_does_not_exist( assert db_family.description == "" expected_geo = ( - data_db.query(Geography).filter(Geography.display_value == "Other").one() + data_db.query(Geography).filter(Geography.display_value == "Afghanistan").one() ) assert expected_geo.id in [g.id for g in db_family.geographies] assert db_family.family_category == "UNFCCC" @@ -279,7 +279,11 @@ def test_update_fails_family_when_user_org_different_to_family_org( assert db_family.description == "apple" assert db_family.family_category == "UNFCCC" - geo_id = data_db.query(Geography.id).filter(Geography.value == "Other").scalar() + geo_id = ( + data_db.query(Geography.id) + # TODO: PDCT-1406: Properly implement multi-geography support + .filter(Geography.value == db_family.geographies[0].value).scalar() + ) assert geo_id in [g.id for g in db_family.geographies] @@ -355,7 +359,11 @@ def test_update_family_when_collection_org_different_to_family_org( assert db_family.description == "" assert db_family.family_category == "UNFCCC" - geo_id = data_db.query(Geography.id).filter(Geography.value == "Other").scalar() + geo_id = ( + data_db.query(Geography.id) + # TODO: PDCT-1406: Properly implement multi-geography support + .filter(Geography.value == db_family.geographies[0].value).scalar() + ) assert geo_id in [g.id for g in db_family.geographies] @@ -375,28 +383,32 @@ def test_update_family_idempotent_when_ok( client: TestClient, data_db: Session, user_header_token ): setup_db(data_db) - family = EXPECTED_FAMILIES[1] + expected_family = EXPECTED_FAMILIES[1] response = client.put( - f"/api/v1/families/{family['import_id']}", - json=family, + f"/api/v1/families/{expected_family['import_id']}", + json=expected_family, headers=user_header_token, ) assert response.status_code == status.HTTP_200_OK data = response.json() - assert data["title"] == EXPECTED_FAMILIES[1]["title"] - assert data["summary"] == EXPECTED_FAMILIES[1]["summary"] - assert data["geography"] == EXPECTED_FAMILIES[1]["geography"] - assert data["category"] == EXPECTED_FAMILIES[1]["category"] + assert 
data["title"] == expected_family["title"] + assert data["summary"] == expected_family["summary"] + assert data["geography"] == expected_family["geography"] + assert data["category"] == expected_family["category"] db_family: Family = ( data_db.query(Family) - .filter(Family.import_id == EXPECTED_FAMILIES[1]["import_id"]) + .filter(Family.import_id == expected_family["import_id"]) .one() ) - assert db_family.title == EXPECTED_FAMILIES[1]["title"] - assert db_family.description == EXPECTED_FAMILIES[1]["summary"] - assert db_family.family_category == EXPECTED_FAMILIES[1]["category"] + assert db_family.title == expected_family["title"] + assert db_family.description == expected_family["summary"] + assert db_family.family_category == expected_family["category"] - geo_id = data_db.query(Geography.id).filter(Geography.value == "Other").scalar() + geo_id = ( + data_db.query(Geography.id) + .filter(Geography.value == expected_family["geography"]) + .scalar() + ) assert geo_id in [g.id for g in db_family.geographies] diff --git a/tests/integration_tests/setup_db.py b/tests/integration_tests/setup_db.py index ad112ee2..3c730507 100644 --- a/tests/integration_tests/setup_db.py +++ b/tests/integration_tests/setup_db.py @@ -31,7 +31,7 @@ "import_id": "A.0.0.1", "title": "apple", "summary": "", - "geography": "Other", + "geography": "AFG", "category": "UNFCCC", "status": "Created", "metadata": { @@ -57,7 +57,7 @@ "import_id": "A.0.0.2", "title": "apple orange banana", "summary": "apple", - "geography": "Other", + "geography": "ZWE", "category": "UNFCCC", "status": "Created", "metadata": { @@ -83,7 +83,7 @@ "import_id": "A.0.0.3", "title": "title", "summary": "orange peas", - "geography": "Other", + "geography": "AFG", "category": "UNFCCC", "status": "Created", "metadata": {"author": "CPR", "author_type": "Party"}, diff --git a/tests/mocks/services/ingest_service.py b/tests/mocks/services/bulk_import_service.py similarity index 78% rename from tests/mocks/services/ingest_service.py rename to tests/mocks/services/bulk_import_service.py index a9d1d4d1..67314984 100644 --- a/tests/mocks/services/ingest_service.py +++ b/tests/mocks/services/bulk_import_service.py @@ -5,11 +5,11 @@ from app.errors import RepositoryError -def mock_ingest_service(ingest_service, monkeypatch: MonkeyPatch, mocker): - ingest_service.throw_repository_error = False +def mock_bulk_import_service(bulk_import_service, monkeypatch: MonkeyPatch, mocker): + bulk_import_service.throw_repository_error = False def maybe_throw(): - if ingest_service.throw_repository_error: + if bulk_import_service.throw_repository_error: raise RepositoryError("bad repo") def mock_import_data( @@ -39,5 +39,5 @@ def mock_import_data( return response - monkeypatch.setattr(ingest_service, "import_data", mock_import_data) - mocker.spy(ingest_service, "import_data") + monkeypatch.setattr(bulk_import_service, "import_data", mock_import_data) + mocker.spy(bulk_import_service, "import_data") diff --git a/tests/unit_tests/clients/aws/test_client.py b/tests/unit_tests/clients/aws/test_client.py index b3dcb7d5..40903000 100644 --- a/tests/unit_tests/clients/aws/test_client.py +++ b/tests/unit_tests/clients/aws/test_client.py @@ -7,7 +7,7 @@ from app.clients.aws.s3bucket import ( S3UploadContext, - upload_ingest_json_to_s3, + upload_bulk_import_json_to_s3, upload_json_to_s3, ) @@ -40,9 +40,9 @@ def test_upload_json_to_s3_when_error(basic_s3_client): @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) -def test_upload_ingest_json_to_s3_success(basic_s3_client): 
+def test_upload_bulk_import_json_to_s3_success(basic_s3_client): json_data = {"test": "test"} - upload_ingest_json_to_s3("1111-1111", "test_corpus_id", json_data) + upload_bulk_import_json_to_s3("1111-1111", "test_corpus_id", json_data) find_response = basic_s3_client.list_objects_v2( Bucket="test_bucket", Prefix="1111-1111-test_corpus_id" diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index 681f38b5..f7ac2c5e 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -18,13 +18,13 @@ import app.service.analytics as analytics_service import app.service.app_user as app_user_service +import app.service.bulk_import as bulk_import_service import app.service.collection as collection_service import app.service.config as config_service import app.service.corpus as corpus_service import app.service.document as document_service import app.service.event as event_service import app.service.family as family_service -import app.service.ingest as ingest_service import app.service.taxonomy as taxonomy_service import app.service.token as token_service import app.service.validation as validation_service @@ -55,13 +55,13 @@ from tests.mocks.repos.organisation_repo import mock_organisation_repo from tests.mocks.services.analytics_service import mock_analytics_service from tests.mocks.services.app_user_service import mock_app_user_service +from tests.mocks.services.bulk_import_service import mock_bulk_import_service from tests.mocks.services.collection_service import mock_collection_service from tests.mocks.services.config_service import mock_config_service from tests.mocks.services.corpus_service import mock_corpus_service from tests.mocks.services.document_service import mock_document_service from tests.mocks.services.event_service import mock_event_service from tests.mocks.services.family_service import mock_family_service -from tests.mocks.services.ingest_service import mock_ingest_service from tests.mocks.services.validation_service import mock_validation_service ORG_ID = 1 @@ -222,10 +222,10 @@ def corpus_service_mock(monkeypatch, mocker): @pytest.fixture -def ingest_service_mock(monkeypatch, mocker): +def bulk_import_service_mock(monkeypatch, mocker): """Mocks the service for a single test.""" - mock_ingest_service(ingest_service, monkeypatch, mocker) - yield ingest_service + mock_bulk_import_service(bulk_import_service, monkeypatch, mocker) + yield bulk_import_service @pytest.fixture diff --git a/tests/unit_tests/routers/ingest/test_bulk_ingest.py b/tests/unit_tests/routers/bulk_import/test_bulk_import.py similarity index 75% rename from tests/unit_tests/routers/ingest/test_bulk_ingest.py rename to tests/unit_tests/routers/bulk_import/test_bulk_import.py index b1dd8291..13e7360f 100644 --- a/tests/unit_tests/routers/ingest/test_bulk_ingest.py +++ b/tests/unit_tests/routers/bulk_import/test_bulk_import.py @@ -5,7 +5,7 @@ from fastapi import status from fastapi.testclient import TestClient -from tests.helpers.ingest import ( +from tests.helpers.bulk_import import ( build_json_file, default_collection, default_document, @@ -51,29 +51,29 @@ def create_input_json_with_two_of_each_entity(): ) -def test_ingest_when_not_authenticated(client: TestClient): - response = client.post("/api/v1/ingest/test") +def test_bulk_import_when_not_authenticated(client: TestClient): + response = client.post("/api/v1/bulk-import/test") assert response.status_code == status.HTTP_401_UNAUTHORIZED -def test_ingest_when_non_admin_non_super(client: TestClient, user_header_token): - 
response = client.post("/api/v1/ingest/test", headers=user_header_token) +def test_bulk_import_when_non_admin_non_super(client: TestClient, user_header_token): + response = client.post("/api/v1/bulk-import/test", headers=user_header_token) assert response.status_code == status.HTTP_403_FORBIDDEN -def test_ingest_when_admin_non_super(client: TestClient, admin_user_header_token): - response = client.post("/api/v1/ingest/test", headers=admin_user_header_token) +def test_bulk_import_when_admin_non_super(client: TestClient, admin_user_header_token): + response = client.post("/api/v1/bulk-import/test", headers=admin_user_header_token) assert response.status_code == status.HTTP_403_FORBIDDEN -def test_ingest_data_when_ok(client: TestClient, superuser_header_token): +def test_bulk_import_data_when_ok(client: TestClient, superuser_header_token): corpus_import_id = "test" input_json = create_input_json_with_two_of_each_entity() with patch("fastapi.BackgroundTasks.add_task") as background_task_mock: response = client.post( - f"/api/v1/ingest/{corpus_import_id}", - files={"new_data": input_json}, + f"/api/v1/bulk-import/{corpus_import_id}", + files={"data": input_json}, headers=superuser_header_token, ) @@ -85,7 +85,7 @@ def test_ingest_data_when_ok(client: TestClient, superuser_header_token): } -def test_ingest_when_no_data( +def test_bulk_import_when_no_data( client: TestClient, superuser_header_token, collection_repo_mock, @@ -95,8 +95,8 @@ def test_ingest_when_no_data( test_data = json.dumps({}).encode("utf-8") test_data_file = io.BytesIO(test_data) response = client.post( - "/api/v1/ingest/test", - files={"new_data": test_data_file}, + "/api/v1/bulk-import/test", + files={"data": test_data_file}, headers=superuser_header_token, ) @@ -105,12 +105,14 @@ def test_ingest_when_no_data( assert response.status_code == status.HTTP_204_NO_CONTENT -def test_ingest_documents_when_no_family(client: TestClient, superuser_header_token): +def test_bulk_import_documents_when_no_family( + client: TestClient, superuser_header_token +): json_input = build_json_file({"documents": [default_document]}) response = client.post( - "/api/v1/ingest/test", - files={"new_data": json_input}, + "/api/v1/bulk-import/test", + files={"data": json_input}, headers=superuser_header_token, ) diff --git a/tests/unit_tests/routers/ingest/test_get_ingest_template.py b/tests/unit_tests/routers/bulk_import/test_get_bulk_import_template.py similarity index 87% rename from tests/unit_tests/routers/ingest/test_get_ingest_template.py rename to tests/unit_tests/routers/bulk_import/test_get_bulk_import_template.py index ea3b7394..4983f4b2 100644 --- a/tests/unit_tests/routers/ingest/test_get_ingest_template.py +++ b/tests/unit_tests/routers/bulk_import/test_get_bulk_import_template.py @@ -1,5 +1,5 @@ """ -Tests the route for retrieving a data ingest template by corpus. +Tests the route for retrieving a bulk import template by corpus. This uses service mocks and ensures the endpoint calls into each service. 
""" @@ -8,36 +8,36 @@ from fastapi.testclient import TestClient -def test_ingest_template_when_not_authenticated(client: TestClient): +def test_bulk_import_template_when_not_authenticated(client: TestClient): response = client.get( - "/api/v1/ingest/template/test_corpus_type", + "/api/v1/bulk-import/template/test_corpus_type", ) assert response.status_code == status.HTTP_401_UNAUTHORIZED -def test_ingest_template_when_non_admin_non_super( +def test_bulk_import_template_when_non_admin_non_super( client: TestClient, user_header_token ): response = client.get( - "/api/v1/ingest/template/test_corpus_type", headers=user_header_token + "/api/v1/bulk-import/template/test_corpus_type", headers=user_header_token ) assert response.status_code == status.HTTP_403_FORBIDDEN -def test_ingest_template_when_admin_non_super( +def test_bulk_import_template_when_admin_non_super( client: TestClient, admin_user_header_token ): response = client.get( - "/api/v1/ingest/template/test_corpus_type", headers=admin_user_header_token + "/api/v1/bulk-import/template/test_corpus_type", headers=admin_user_header_token ) assert response.status_code == status.HTTP_403_FORBIDDEN -def test_ingest_template_when_ok( +def test_bulk_import_template_when_ok( client: TestClient, superuser_header_token, db_client_corpus_helpers_mock ): response = client.get( - "/api/v1/ingest/template/test_corpus_type", + "/api/v1/bulk-import/template/test_corpus_type", headers=superuser_header_token, ) diff --git a/tests/unit_tests/service/ingest/test_ingest_service.py b/tests/unit_tests/service/bulk_import/test_bulk_import_service.py similarity index 68% rename from tests/unit_tests/service/ingest/test_ingest_service.py rename to tests/unit_tests/service/bulk_import/test_bulk_import_service.py index 2eb1407c..1c427a80 100644 --- a/tests/unit_tests/service/ingest/test_ingest_service.py +++ b/tests/unit_tests/service/bulk_import/test_bulk_import_service.py @@ -5,13 +5,14 @@ import pytest -import app.service.ingest as ingest_service +import app.service.bulk_import as bulk_import_service from app.errors import ValidationError +from tests.helpers.bulk_import import default_document, default_family -@patch("app.service.ingest.uuid4", Mock(return_value="1111-1111")) +@patch("app.service.bulk_import.uuid4", Mock(return_value="1111-1111")) @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) -@patch("app.service.ingest._exists_in_db", Mock(return_value=False)) +@patch("app.service.bulk_import._exists_in_db", Mock(return_value=False)) def test_input_json_and_result_saved_to_s3_on_bulk_import( basic_s3_client, validation_service_mock, corpus_repo_mock, collection_repo_mock ): @@ -26,7 +27,7 @@ def test_input_json_and_result_saved_to_s3_on_bulk_import( ] } - ingest_service.import_data(json_data, "test_corpus_id") + bulk_import_service.import_data(json_data, "test_corpus_id") bulk_import_input_json = basic_s3_client.list_objects_v2( Bucket=bucket_name, Prefix="1111-1111-result-test_corpus_id" @@ -40,7 +41,7 @@ def test_input_json_and_result_saved_to_s3_on_bulk_import( assert {"collections": ["test.new.collection.0"]} == json.loads(body) -@patch("app.service.ingest._exists_in_db", Mock(return_value=False)) +@patch("app.service.bulk_import._exists_in_db", Mock(return_value=False)) @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) def test_slack_notification_sent_on_success( basic_s3_client, @@ -60,10 +61,10 @@ def test_slack_notification_sent_on_success( with ( patch( - "app.service.ingest.notification_service.send_notification" + 
"app.service.bulk_import.notification_service.send_notification" ) as mock_notification_service, ): - ingest_service.import_data(test_data, "test_corpus_id") + bulk_import_service.import_data(test_data, "test_corpus_id") assert 2 == mock_notification_service.call_count mock_notification_service.assert_called_with( @@ -71,7 +72,7 @@ def test_slack_notification_sent_on_success( ) -@patch("app.service.ingest._exists_in_db", Mock(return_value=False)) +@patch("app.service.bulk_import._exists_in_db", Mock(return_value=False)) @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) def test_slack_notification_sent_on_error(caplog, basic_s3_client, corpus_repo_mock): corpus_repo_mock.error = True @@ -89,10 +90,10 @@ def test_slack_notification_sent_on_error(caplog, basic_s3_client, corpus_repo_m with ( caplog.at_level(logging.ERROR), patch( - "app.service.ingest.notification_service.send_notification" + "app.service.bulk_import.notification_service.send_notification" ) as mock_notification_service, ): - ingest_service.import_data(test_data, "test") + bulk_import_service.import_data(test_data, "test") assert 2 == mock_notification_service.call_count mock_notification_service.assert_called_with( @@ -104,6 +105,30 @@ def test_slack_notification_sent_on_error(caplog, basic_s3_client, corpus_repo_m ) +@pytest.mark.parametrize( + "test_data", + [ + {"families": [{**default_family, "metadata": {"key": [1]}}]}, + {"families": [{**default_family, "metadata": {"key": None}}]}, + {"families": [{**default_family, "metadata": {"key": 1}}]}, + {"documents": [{**default_document, "metadata": {"key": 1}}]}, + ], +) +@patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) +@patch("app.service.bulk_import._exists_in_db", Mock(return_value=False)) +def test_import_data_when_metadata_contains_non_string_values( + test_data, + corpus_repo_mock, + validation_service_mock, + caplog, + basic_s3_client, +): + with caplog.at_level(logging.ERROR): + bulk_import_service.import_data(test_data, "test") + + assert "Input should be a valid string" in caplog.text + + @patch.dict(os.environ, {"BULK_IMPORT_BUCKET": "test_bucket"}) def test_import_data_when_data_invalid(caplog, basic_s3_client): test_data = { @@ -117,7 +142,7 @@ def test_import_data_when_data_invalid(caplog, basic_s3_client): } with caplog.at_level(logging.ERROR): - ingest_service.import_data(test_data, "test") + bulk_import_service.import_data(test_data, "test") assert "The import id invalid is invalid!" 
in caplog.text @@ -128,7 +153,7 @@ def test_save_families_when_corpus_invalid(corpus_repo_mock, validation_service_ test_data = [{"import_id": "test.new.family.0"}] with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") + bulk_import_service.save_families(test_data, "test") assert "No organisation associated with corpus test" == e.value.message @@ -137,7 +162,7 @@ def test_save_families_when_data_invalid(corpus_repo_mock, validation_service_mo test_data = [{"import_id": "invalid"}] with pytest.raises(ValidationError) as e: - ingest_service.save_families(test_data, "test") + bulk_import_service.save_families(test_data, "test") assert "Error" == e.value.message @@ -147,16 +172,16 @@ def test_save_documents_when_data_invalid(validation_service_mock): test_data = [{"import_id": "invalid"}] with pytest.raises(ValidationError) as e: - ingest_service.save_documents(test_data, "test") + bulk_import_service.save_documents(test_data, "test") assert "Error" == e.value.message -@patch("app.service.ingest.generate_slug", Mock(return_value="test-slug_1234")) -@patch("app.service.ingest._exists_in_db", Mock(return_value=False)) -def test_do_not_save_documents_over_ingest_limit( +@patch("app.service.bulk_import.generate_slug", Mock(return_value="test-slug_1234")) +@patch("app.service.bulk_import._exists_in_db", Mock(return_value=False)) +def test_do_not_save_documents_over_bulk_import_limit( validation_service_mock, document_repo_mock, monkeypatch ): - monkeypatch.setattr(ingest_service, "DOCUMENT_INGEST_LIMIT", 1) + monkeypatch.setattr(bulk_import_service, "DOCUMENT_BULK_IMPORT_LIMIT", 1) test_data = [ { @@ -179,7 +204,7 @@ def test_do_not_save_documents_over_ingest_limit( }, ] - saved_documents = ingest_service.save_documents(test_data, "test") + saved_documents = bulk_import_service.save_documents(test_data, "test") assert ["test.new.document.0"] == saved_documents @@ -189,5 +214,5 @@ def test_save_events_when_data_invalid(validation_service_mock): test_data = [{"import_id": "invalid"}] with pytest.raises(ValidationError) as e: - ingest_service.save_events(test_data, "test") + bulk_import_service.save_events(test_data, "test") assert "Error" == e.value.message
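Note (outside the diff): a small illustration of the S3 object naming produced by the renamed upload_bulk_import_json_to_s3 helper. import_data uploads the request payload and the result under the same uuid so the two objects can be correlated in BULK_IMPORT_BUCKET; the corpus id below is just an example, and in practice each upload stamps its own timestamp:

from datetime import datetime
from uuid import uuid4

import_uuid = uuid4()
corpus_import_id = "UNFCCC.corpus.i00000001.n0000"
timestamp = datetime.now().strftime("%m-%d-%YT%H:%M:%S")

# import_data calls upload_bulk_import_json_to_s3 once per suffix.
request_key = f"{import_uuid}-request-{corpus_import_id}-{timestamp}.json"
result_key = f"{import_uuid}-result-{corpus_import_id}-{timestamp}.json"
print(request_key)
print(result_key)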