diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 9a66c0e8..8c4c0200 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -23,3 +23,7 @@ # Updating the test data file for document passages to be indent=2 44624dcd1fa0835708bd9187a39bb0da8a31cd03 + +# Fix SQL query formatting +047766a85f086fc0986a6f2b49fee9d73fa219e8 +ab3476708920c5760f058ec40d14d008f94f5bad diff --git a/.trunk/configs/.sqlfluff b/.trunk/configs/.sqlfluff new file mode 100644 index 00000000..de7aacd3 --- /dev/null +++ b/.trunk/configs/.sqlfluff @@ -0,0 +1,30 @@ +[sqlfluff] +dialect = postgres +exclude_rules = LT02, LT09 + +[sqlfluff:indentation] +indented_ctes = True + +[sqlfluff:layout:type:colon] +spacing_before = single +spacing_after = single + +[sqlfluff:layout:type:parameter] +spacing_before = touch +spacing_after = any + +[sqlfluff:rules:references.special_chars] +allow_space_in_identifier = True +additional_allowed_characters = ["/", "_", "-", "(", ")"] + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.identifiers] +extended_capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.functions] +extended_capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 6d746ff9..31ff5439 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -5,6 +5,14 @@ version: 0.1 cli: version: 1.22.0 +tools: + definitions: + - name: sqlfluff + runtime: python + package: sqlfluff + shims: [sqlfluff] + known_good_version: 1.4.5 + # Trunk provides extensibility via plugins. # (https://docs.trunk.io/plugins) plugins: @@ -27,6 +35,7 @@ lint: disabled: - hadolint - oxipng + definitions: - name: bandit direct_configs: [bandit.yaml] @@ -34,6 +43,45 @@ lint: - name: lint run: bandit --exit-zero -c bandit.yaml --format json --output ${tmpfile} ${target} + - name: sqlfluff + files: [sql, sql-j2, dml, ddl] + tools: [sqlfluff] + description: A dialect-flexible and configurable SQL linter + known_good_version: 1.4.5 + direct_configs: + - .sqlfluff + affects_cache: + - pyproject.toml + suggest_if: config_present + commands: + - name: lint + run: sqlfluff lint ${target} --format json --nofail + output: sarif + success_codes: [0] + read_output_from: stdout + parser: + runtime: python + run: python3 ${plugin}/linters/sqlfluff/sqlfluff_to_sarif.py + + - name: fix + version: ">=3.0.0" + run: sqlfluff fix ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + + - name: format + run: sqlfluff format ${target} --disable-progress-bar + output: rewrite + formatter: true + in_place: true + success_codes: [0, 1] + enabled: false + batch: true + ignore: - linters: [ALL] paths: @@ -45,6 +93,8 @@ lint: - LICENSE.md enabled: + - sqlfluff@3.2.5: + commands: [lint, fix, format] - actionlint@1.6.27 - bandit@1.7.8 - black@24.4.2 diff --git a/app/repository/document.py b/app/repository/document.py index 57579a26..73d19d47 100644 --- a/app/repository/document.py +++ b/app/repository/document.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Database helper functions for the documents entity.""" import logging import os @@ -22,8 +18,9 @@ from db_client.models.dfce.metadata import FamilyMetadata from db_client.models.document.physical_document import PhysicalDocument from db_client.models.organisation.organisation import Organisation -from sqlalchemy import func +from sqlalchemy import bindparam, func, text from sqlalchemy.orm import Session +from sqlalchemy.types import ARRAY, String from app.models.document import ( CollectionOverviewResponse, @@ -42,22 +39,6 @@ _LOGGER = logging.getLogger(__file__) -def get_slugged_object_from_allowed_corpora_query( - template_query, slug_name: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. - - :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we - should allow the data to be dumped. - :return str: The SQL query to perform on the database session. - """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace("{slug_name}", slug_name).replace( # type: ignore - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - def get_slugged_objects( db: Session, slug: str, allowed_corpora: Optional[list[str]] = None ) -> tuple[Optional[str], Optional[str]]: @@ -74,14 +55,22 @@ def get_slugged_objects( :return tuple[Optional[str], Optional[str]]: the FamilyDocument import id or the Family import_id. """ - if allowed_corpora is not None: - query_template = get_query_template( - os.path.join("app", "repository", "sql", "slug_lookup.sql") + if allowed_corpora not in [None, []]: + query_template = text( + get_query_template( + os.path.join("app", "repository", "sql", "slug_lookup.sql") + ) + ) + + query_template = query_template.bindparams( + bindparam("slug_name", type_=String), + bindparam( + "allowed_corpora_ids", value=allowed_corpora, type_=ARRAY(String) + ), ) - query = get_slugged_object_from_allowed_corpora_query( - query_template, slug, allowed_corpora + query = db.execute( + query_template, {"slug_name": slug, "allowed_corpora_ids": allowed_corpora} ) - query = db.execute(query) else: query = db.query(Slug.family_document_import_id, Slug.family_import_id).filter( Slug.name == slug diff --git a/app/repository/download.py b/app/repository/download.py index 1ed90396..33592cc0 100644 --- a/app/repository/download.py +++ b/app/repository/download.py @@ -5,6 +5,8 @@ import pandas as pd from fastapi import Depends +from sqlalchemy import bindparam, text +from sqlalchemy.types import ARRAY, DATETIME, String from app.clients.db.session import get_db from app.repository.helpers import get_query_template @@ -12,32 +14,34 @@ _LOGGER = getLogger(__name__) -def create_query( - template_query, ingest_cycle_start: str, allowed_corpora_ids: list[str] -) -> str: - """Create download whole database query, replacing variables. +def get_whole_database_dump( + ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) +): + """Get whole database dump and bind variables. :param str ingest_cycle_start: The current ingest cycle date. - :param list[str] allowed_corpora_ids: The corpora from which we + :param list[str] corpora_ids: The corpora from which we should allow the data to be dumped. - :return str: The SQL query to perform on the database session. + :return pd.DataFrame: A DataFrame containing the results of the SQL + query that gets the whole database dump in our desired format. """ - corpora_ids = "'" + "','".join(allowed_corpora_ids) + "'" - return template_query.replace( # type: ignore - "{ingest_cycle_start}", ingest_cycle_start - ).replace( - "{allowed_corpora_ids}", corpora_ids - ) # type: ignore - - -def get_whole_database_dump( - ingest_cycle_start: str, allowed_corpora_ids: list[str], db=Depends(get_db) -): - query_template = get_query_template( - os.path.join("app", "repository", "sql", "download.sql") + query = text( + get_query_template(os.path.join("app", "repository", "sql", "download.sql")) + ).bindparams( + bindparam("ingest_cycle_start", type_=DATETIME), + bindparam( + "allowed_corpora_ids", value=allowed_corpora_ids, type_=ARRAY(String) + ), ) - query = create_query(query_template, ingest_cycle_start, allowed_corpora_ids) with db.connection() as conn: - df = pd.read_sql(query, conn.connection) + result = conn.execute( + query, + { + "ingest_cycle_start": ingest_cycle_start, + "allowed_corpora_ids": allowed_corpora_ids, + }, + ) + columns = result.keys() + df = pd.DataFrame(result.fetchall(), columns=columns) return df diff --git a/app/repository/helpers.py b/app/repository/helpers.py index e976683b..958e0b38 100644 --- a/app/repository/helpers.py +++ b/app/repository/helpers.py @@ -1,8 +1,4 @@ -""" -Functions to support the documents endpoints - -old functions (non DFC) are moved to the deprecated_documents.py file. -""" +"""Helper functions for the repository layer.""" from functools import lru_cache diff --git a/app/repository/sql/download.sql b/app/repository/sql/download.sql index 8a807080..15bbf8ac 100644 --- a/app/repository/sql/download.sql +++ b/app/repository/sql/download.sql @@ -1,243 +1,355 @@ -WITH -deduplicated_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc -), -unique_family_slugs as ( - SELECT - distinct ON (slug.family_import_id) - slug.family_import_id, slug.created, slug.name - FROM ( - SELECT - slug.family_import_id as "family_import_id", - count(*) as count - FROM slug - WHERE slug.family_import_id is not null - group by slug.family_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_import_id = slug.family_import_id - order by slug.family_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_family_slugs as ( - SELECT - deduplicated_family_slugs.family_import_id as "family_import_id", - deduplicated_family_slugs.created as "created", - deduplicated_family_slugs.name as "name" - FROM deduplicated_family_slugs - UNION ALL - SELECT - unique_family_slugs.family_import_id as "family_import_id", - unique_family_slugs.created as "created", - unique_family_slugs.name as "name" - FROM unique_family_slugs - order by family_import_id desc, created desc - ), deduplicated_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) > 1 - ) duplicates - left join slug - on duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc -), -unique_doc_slugs as ( - SELECT - distinct ON (slug.family_document_import_id) - slug.family_document_import_id, - slug.created, - slug.name - FROM ( - SELECT - slug.family_document_import_id as "family_document_import_id", - count(*) as count - FROM slug - WHERE slug.family_document_import_id is not null - group by slug.family_document_import_id - having count(*) = 1 - ) non_duplicates - left join slug - on non_duplicates.family_document_import_id = slug.family_document_import_id - order by - slug.family_document_import_id desc, slug.created desc, slug.ctid desc - ), most_recent_doc_slugs as ( - SELECT - deduplicated_doc_slugs.family_document_import_id - as "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM deduplicated_doc_slugs - UNION ALL - SELECT - unique_doc_slugs.family_document_import_id as "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM unique_doc_slugs - order by family_document_import_id desc, created desc - ), event_dates as ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date, - max(family_event.date::date) last_changed - FROM - family_event - GROUP BY - family_import_id -) +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug ON duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC + ), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC + ), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC + ), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN COUNT(*) FILTER ( + WHERE + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date, + MAX(family_event.date::DATE) AS last_changed + FROM + family_event + GROUP BY + family_event.family_import_id + ), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id + ), + +n1 AS ( + SELECT + collection_family.family_import_id, + STRING_AGG(collection.import_id, ';') AS collection_import_ids, + STRING_AGG(collection.title, ';') AS collection_titles, + STRING_AGG(collection.description, ';') AS collection_descriptions + FROM + collection + INNER JOIN + collection_family + ON collection.import_id = collection_family.collection_import_id + GROUP BY + collection_family.family_import_id + ) + SELECT -ds.name as "Document ID", -p.title as "Document Title", -fs.name as "Family ID", -f.title as "Family Title", -f.description as "Family Summary", -n1.collection_titles as "Collection Title(s)", -n1.collection_descriptions as "Collection Description(s)", -INITCAP(d.valid_metadata::json#>>'{ - role,0}') as -"Document Role", -d.variant_name as "Document Variant", -p.source_url as "Document Content URL", -INITCAP(d.valid_metadata::json#>>'{ - type,0}') as -"Document Type", -CASE - WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' - ELSE INITCAP(f.family_category::TEXT) -END "Category", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'framework')), ';') -as "Framework", -n2.language as "Language", -o.name as "Source", -fg.geo_isos as "Geography ISOs", -fg.geo_display_values as "Geographies", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'topic')), ';') -as "Topic/Response", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'hazard')), ';') -as "Hazard", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'sector')), ';') -as "Sector", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'keyword')), ';') -as "Keyword", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'instrument')), ';') -as "Instrument", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author')), ';') -as "Author", -array_to_string(ARRAY( - SELECT jsonb_array_elements_text(fm.value->'author_type')), ';') -as "Author Type", -fp.published_date as "First event in timeline", -fp.last_changed as "Last event in timeline", -n3.event_type_names as "Full timeline of events (types)", -n3.event_dates as "Full timeline of events (dates)", -d.created::date as "Date Added to System", -f.last_modified::date as "Last ModIFied on System", -d.import_id as "Internal Document ID", -f.import_id as "Internal Family ID", -n1.collection_import_ids as "Internal Collection ID(s)" -FROM physical_document p -JOIN family_document d -ON p.id = d.physical_document_id -JOIN family f -ON d.family_import_id = f.import_id -FULL JOIN ( - SELECT - family_geography.family_import_id as "family_import_id", - string_agg(geography.value, ';') AS geo_isos, - string_agg(geography.display_value, ';') AS geo_display_values - FROM - geography - INNER JOIN family_geography - ON geography.id = family_geography.geography_id - GROUP BY family_geography.family_import_id -) fg ON fg.family_import_id=f.import_id -join family_corpus fc -on f.import_id = fc.family_import_id -join corpus c -on fc.corpus_import_id = c.import_id -join organisation o -on c.organisation_id = o.id -join family_metadata fm -on fm.family_import_id = f.import_id -FULL JOIN ( - SELECT - collection_family.family_import_id as "family_import_id", - string_agg(collection.import_id, ';') AS collection_import_ids, - string_agg(collection.title, ';') AS collection_titles, - string_agg(collection.description, ';') AS collection_descriptions - FROM - collection - INNER JOIN collection_family - ON collection_family.collection_import_id = collection.import_id - GROUP BY collection_family.family_import_id -) n1 ON n1.family_import_id=f.import_id -left JOIN ( - SELECT - p.id as "id", - string_agg(l.name, ';' ORDER BY l.name) AS language - FROM physical_document p - left join physical_document_language pdl - on pdl.document_id = p.id - left join language l - on l.id = pdl.language_id - GROUP BY p.id -) n2 ON n2.id=d.physical_document_id -FULL JOIN ( - SELECT - family_event.family_import_id, - string_agg(family_event.import_id, ';') AS event_import_ids, - string_agg(family_event.title, ';') AS event_titles, - string_agg(family_event.event_type_name, ';') AS event_type_names, - string_agg(family_event.date::date::text, ';') AS event_dates - FROM family_event - INNER JOIN family ON family.import_id = family_event.family_import_id - GROUP BY family_event.family_import_id -) n3 ON n3.family_import_id=f.import_id -LEFT JOIN most_recent_doc_slugs ds -on ds.family_document_import_id = d.import_id -LEFT JOIN most_recent_family_slugs fs on fs.family_import_id = f.import_id -LEFT JOIN event_dates fp on fp.family_import_id = f.import_id -WHERE d.last_modified < '{ingest_cycle_start}' AND fc.corpus_import_id in ({allowed_corpora_ids}) -ORDER BY d.last_modified desc, d.created desc, d.ctid desc, n1.family_import_id + ds.name AS "Document ID", + p.title AS "Document Title", + fs.name AS "Family ID", + f.title AS "Family Title", + f.description AS "Family Summary", + n1.collection_titles AS "Collection Title(s)", + n1.collection_descriptions AS "Collection Description(s)", + d.variant_name AS "Document Variant", + p.source_url AS "Document Content URL", + language_agg.display_name AS "Language", + o.name AS "Source", + fg.geo_isos AS "Geography ISOs", + fg.geo_display_values AS "Geographies", + fp.published_date AS "First event in timeline", + fp.last_changed AS "Last event in timeline", + n3.event_type_names AS "Full timeline of events (types)", + n3.event_dates AS "Full timeline of events (dates)", + d.created::DATE AS "Date Added to System", + f.last_modified::DATE AS "Last ModIFied on System", + d.import_id AS "Internal Document ID", + f.import_id AS "Internal Family ID", + n1.collection_import_ids AS "Internal Collection ID(s)", + INITCAP(d.valid_metadata::JSON #>> '{ + role,0}') AS "Document Role", + INITCAP(d.valid_metadata::JSON #>> '{ + type,0}') AS "Document Type", + CASE + WHEN f.family_category = 'UNFCCC' THEN 'UNFCCC' + ELSE INITCAP(f.family_category::TEXT) + END AS "Category", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'framework') + ), + ';' + ) AS "Framework", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'topic') + ), + ';' + ) AS "Topic/Response", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'hazard') + ), + ';' + ) AS "Hazard", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'sector') + ), + ';' + ) AS "Sector", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'keyword') + ), + ';' + ) AS "Keyword", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'instrument') + ), + ';' + ) AS "Instrument", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author') + ), + ';' + ) AS "Author", + ARRAY_TO_STRING( + ARRAY( + SELECT + JSONB_ARRAY_ELEMENTS_TEXT(fm.value -> 'author_type') + ), + ';' + ) AS "Author Type" +FROM + physical_document AS p + INNER JOIN family_document AS d ON p.id = d.physical_document_id + INNER JOIN family AS f ON d.family_import_id = f.import_id + FULL JOIN fg ON f.import_id = fg.family_import_id + INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id + INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id + INNER JOIN organisation AS o ON c.organisation_id = o.id + INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id + FULL JOIN n1 ON f.import_id = n1.family_import_id + LEFT JOIN ( + SELECT + p.id, + STRING_AGG( + l.name, + ';' + ORDER BY + l.name + ) AS display_name + FROM + physical_document AS p + LEFT JOIN + physical_document_language AS pdl + ON p.id = pdl.document_id + LEFT JOIN language AS l ON pdl.language_id = l.id + GROUP BY + p.id + ) AS language_agg ON d.physical_document_id = language_agg.id + FULL JOIN ( + SELECT + family_event.family_import_id, + STRING_AGG(family_event.import_id, ';') AS event_import_ids, + STRING_AGG(family_event.title, ';') AS event_titles, + STRING_AGG(family_event.event_type_name, ';') AS event_type_names, + STRING_AGG(family_event.date::DATE::TEXT, ';') AS event_dates + FROM + family_event + INNER JOIN + family + ON family_event.family_import_id = family.import_id + GROUP BY + family_event.family_import_id + ) AS n3 ON f.import_id = n3.family_import_id + LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id + LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id + LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id +WHERE + d.last_modified < :ingest_cycle_start + AND fc.corpus_import_id = ANY(:allowed_corpora_ids) +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + n1.family_import_id ASC diff --git a/app/repository/sql/pipeline.sql b/app/repository/sql/pipeline.sql index af6023e6..7a5d0e40 100644 --- a/app/repository/sql/pipeline.sql +++ b/app/repository/sql/pipeline.sql @@ -1,36 +1,43 @@ -WITH deduplicated_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name +WITH deduplicated_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_import_id IS NOT NULL - GROUP BY - slug.family_import_id - HAVING - Count(*) > 1 ) duplicates - left join + ( + SELECT + slug.family_import_id, + COUNT(*) AS count + FROM + slug + WHERE + slug.family_import_id IS NOT NULL + GROUP BY + slug.family_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN slug - ON duplicates.family_import_id = slug.family_import_id + ON duplicates.family_import_id = slug.family_import_id ORDER BY slug.family_import_id DESC, slug.created DESC, - slug.ctid DESC ), - unique_family_slugs AS ( SELECT - DISTINCT - ON (slug.family_import_id) slug.family_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_import_id AS "family_import_id", - Count(*) AS count + slug.ctid DESC +), + +unique_family_slugs AS ( + SELECT DISTINCT + ON (slug.family_import_id) + slug.family_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_import_id, + COUNT(*) AS count FROM slug WHERE @@ -38,219 +45,235 @@ WITH deduplicated_family_slugs AS ( SELECT GROUP BY slug.family_import_id HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_import_id = slug.family_import_id - ORDER BY - slug.family_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_family_slugs AS ( SELECT - deduplicated_family_slugs.family_import_id AS "family_import_id", - deduplicated_family_slugs.created AS "created", - deduplicated_family_slugs.name AS "name" - FROM - deduplicated_family_slugs - UNION - ALL SELECT - unique_family_slugs.family_import_id AS "family_import_id", - unique_family_slugs.created AS "created", - unique_family_slugs.name AS "name" + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON non_duplicates.family_import_id = slug.family_import_id + ORDER BY + slug.family_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_family_slugs AS ( + SELECT + deduplicated_family_slugs.family_import_id, + deduplicated_family_slugs.created, + deduplicated_family_slugs.name + FROM + deduplicated_family_slugs + UNION ALL + SELECT + unique_family_slugs.family_import_id, + unique_family_slugs.created, + unique_family_slugs.name + FROM + unique_family_slugs + ORDER BY + family_import_id DESC, + created DESC +), + +deduplicated_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - unique_family_slugs - ORDER BY - family_import_id DESC, - created DESC ), deduplicated_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name + slug + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) > 1 + ) AS duplicates + LEFT JOIN + slug + ON + duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +unique_doc_slugs AS ( + SELECT DISTINCT + ON (slug.family_document_import_id) + slug.family_document_import_id, + slug.created, + slug.name + FROM + ( + SELECT + slug.family_document_import_id, + COUNT(*) AS count FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) > 1 ) duplicates - left join slug - ON duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - unique_doc_slugs AS ( SELECT - DISTINCT - ON (slug.family_document_import_id) slug.family_document_import_id, - slug.created, - slug.name - FROM - ( SELECT - slug.family_document_import_id AS "family_document_import_id", - Count(*) AS count - FROM - slug - WHERE - slug.family_document_import_id IS NOT NULL - GROUP BY - slug.family_document_import_id - HAVING - Count(*) = 1 ) non_duplicates - left join - slug - ON non_duplicates.family_document_import_id = slug.family_document_import_id - ORDER BY - slug.family_document_import_id DESC, - slug.created DESC, - slug.ctid DESC ), - most_recent_doc_slugs AS ( - SELECT - deduplicated_doc_slugs.family_document_import_id AS "family_document_import_id", - deduplicated_doc_slugs.created, - deduplicated_doc_slugs.name - FROM - deduplicated_doc_slugs - UNION - ALL SELECT - unique_doc_slugs.family_document_import_id AS "family_document_import_id", - unique_doc_slugs.created, - unique_doc_slugs.name - FROM - unique_doc_slugs - ORDER BY - family_document_import_id DESC, - created DESC - ), event_dates AS ( - SELECT - family_event.family_import_id AS family_import_id, - CASE - WHEN COUNT(*) FILTER ( - WHERE family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - ) > 0 THEN - MIN(CASE - WHEN family_event.event_type_name = - (family_event.valid_metadata->'datetime_event_name'->>0) - THEN family_event.date::TIMESTAMPTZ - END) - ELSE - MIN(family_event.date::TIMESTAMPTZ) - END AS published_date - FROM - family_event - GROUP BY - family_import_id - ) SELECT - f.title AS "family_title", - p.title AS "physical_document_title", - f.description AS "family_description", - CASE - WHEN f.family_category IN ('UNFCCC', - 'MCF') THEN Upper(f.family_category::text) - ELSE Initcap(f.family_category::text) - END "family_category", - fp.published_date AS "family_published_date", - d.import_id AS "family_document_import_id", - ds.name AS "family_document_slug", - f.import_id AS "family_import_id", - fs.name AS "family_slug", - p.source_url AS "physical_document_source_url", - d.valid_metadata::json#>>'{type,0}' AS "family_document_type", - o.name AS "organisation_name", - geos.geographies AS "geographies", - c.import_id AS "corpus_import_id", - c.corpus_type_name AS "corpus_type_name", - langs.languages AS "languages", - fm.value AS "family_metadata", - d.valid_metadata AS "family_document_metadata" - FROM - physical_document p - join - family_document d - ON p.id = d.physical_document_id - join - family f - ON d.family_import_id = f.import_id full - join - ( - SELECT - family_geography.family_import_id AS "family_import_id", - string_agg(geography.value, - ';') AS geo_isos, - string_agg(geography.display_value, - ';') AS geo_display_values - FROM - geography - inner join - family_geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) fg - ON fg.family_import_id=f.import_id - join - family_corpus fc - ON f.import_id = fc.family_import_id - join - corpus c - ON fc.corpus_import_id = c.import_id - join - organisation o - ON c.organisation_id = o.id - join - family_metadata fm - ON fm.family_import_id = f.import_id - left outer join - ( - SELECT - family_document.import_id AS family_document_import_id, - json_agg(DISTINCT(LANGUAGE.name)) AS languages - FROM - family_document - join - physical_document_language - ON physical_document_language.document_id = family_document.physical_document_id - join - LANGUAGE - ON LANGUAGE.id = physical_document_language.language_id - GROUP BY - family_document.import_id - ) AS langs - ON langs.family_document_import_id = d.import_id - left outer join - ( - SELECT - family_geography.family_import_id AS family_import_id, - json_agg(DISTINCT(geography.value)) AS geographies - FROM - family_geography - join - geography - ON geography.id = family_geography.geography_id - GROUP BY - family_geography.family_import_id - ) AS geos - ON geos.family_import_id = f.import_id - left join - most_recent_doc_slugs ds - ON ds.family_document_import_id = d.import_id - left join - most_recent_family_slugs fs - ON fs.family_import_id = f.import_id - left join - event_dates fp - ON fp.family_import_id = f.import_id + WHERE + slug.family_document_import_id IS NOT NULL + GROUP BY + slug.family_document_import_id + HAVING + COUNT(*) = 1 + ) AS non_duplicates + LEFT JOIN + slug + ON + non_duplicates.family_document_import_id + = slug.family_document_import_id + ORDER BY + slug.family_document_import_id DESC, + slug.created DESC, + slug.ctid DESC +), + +most_recent_doc_slugs AS ( + SELECT + deduplicated_doc_slugs.family_document_import_id, + deduplicated_doc_slugs.created, + deduplicated_doc_slugs.name + FROM + deduplicated_doc_slugs + UNION ALL + SELECT + unique_doc_slugs.family_document_import_id, + unique_doc_slugs.created, + unique_doc_slugs.name + FROM + unique_doc_slugs + ORDER BY + family_document_import_id DESC, + created DESC +), + +event_dates AS ( + SELECT + family_event.family_import_id, + CASE + WHEN + COUNT(*) FILTER ( WHERE - d.document_status != 'DELETED' - AND fg.family_import_id = f.import_id - ORDER BY - d.last_modified DESC, - d.created DESC, - d.ctid DESC, - f.import_id + family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) + ) > 0 + THEN MIN( + CASE + WHEN family_event.event_type_name = ( + family_event.valid_metadata + -> 'datetime_event_name' + ->> 0 + ) THEN family_event.date::TIMESTAMPTZ + END + ) + ELSE MIN(family_event.date::TIMESTAMPTZ) + END AS published_date + FROM + family_event + GROUP BY + family_event.family_import_id +), + +fg AS ( + SELECT + family_geography.family_import_id, + STRING_AGG(geography.value, ';') AS geo_isos, + STRING_AGG(geography.display_value, ';') AS geo_display_values + FROM + geography + INNER JOIN + family_geography + ON geography.id = family_geography.geography_id + GROUP BY + family_geography.family_import_id +), + +geos AS ( + SELECT + family_geography.family_import_id, + JSON_AGG(DISTINCT geography.value) AS geographies + FROM + family_geography + INNER JOIN geography ON family_geography.geography_id = geography.id + GROUP BY + family_geography.family_import_id +) + +SELECT + f.title AS family_title, + p.title AS physical_document_title, + f.description AS family_description, + fp.published_date AS family_published_date, + d.import_id AS family_document_import_id, + ds.name AS family_document_slug, + f.import_id AS family_import_id, + fs.name AS family_slug, + p.source_url AS physical_document_source_url, + o.name AS organisation_name, + geos.geographies, + c.import_id AS corpus_import_id, + c.corpus_type_name, + langs.languages, + fm.value AS family_metadata, + d.valid_metadata AS family_document_metadata, + CASE + WHEN + f.family_category IN ('UNFCCC', 'MCF') + THEN UPPER(f.family_category::TEXT) + ELSE INITCAP(f.family_category::TEXT) + END AS family_category, + d.valid_metadata::JSON #>> '{type,0}' AS family_document_type +FROM + physical_document AS p +INNER JOIN family_document AS d ON p.id = d.physical_document_id +INNER JOIN family AS f ON d.family_import_id = f.import_id +FULL JOIN fg ON f.import_id = fg.family_import_id +INNER JOIN family_corpus AS fc ON f.import_id = fc.family_import_id +INNER JOIN corpus AS c ON fc.corpus_import_id = c.import_id +INNER JOIN organisation AS o ON c.organisation_id = o.id +INNER JOIN family_metadata AS fm ON f.import_id = fm.family_import_id +LEFT OUTER JOIN ( + SELECT + family_document.import_id AS family_document_import_id, + JSON_AGG(DISTINCT language.name) AS languages + FROM + family_document + INNER JOIN + physical_document_language + ON + family_document.physical_document_id + = physical_document_language.document_id + INNER JOIN + language + ON physical_document_language.language_id = language.id + GROUP BY + family_document.import_id +) AS langs ON d.import_id = langs.family_document_import_id +LEFT OUTER JOIN geos ON f.import_id = geos.family_import_id +LEFT JOIN + most_recent_doc_slugs AS ds + ON d.import_id = ds.family_document_import_id +LEFT JOIN + most_recent_family_slugs AS fs + ON f.import_id = fs.family_import_id +LEFT JOIN event_dates AS fp ON f.import_id = fp.family_import_id +WHERE + d.document_status != 'DELETED' + AND fg.family_import_id = f.import_id +ORDER BY + d.last_modified DESC, + d.created DESC, + d.ctid DESC, + f.import_id ASC diff --git a/app/repository/sql/slug_lookup.sql b/app/repository/sql/slug_lookup.sql index 9d649067..09cb2e69 100644 --- a/app/repository/sql/slug_lookup.sql +++ b/app/repository/sql/slug_lookup.sql @@ -1,20 +1,33 @@ -SELECT - slug.family_document_import_id, slug.family_import_id +-- First query for family document slugs +SELECT DISTINCT + slug.family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family ON family.import_id = slug.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}) + INNER JOIN family_document + ON slug.family_document_import_id = family_document.import_id + INNER JOIN family + ON family_document.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) UNION -SELECT - slug.family_document_import_id, slug.family_import_id +-- Second query for family slugs +SELECT DISTINCT + NULL AS family_document_import_id, + slug.family_import_id FROM slug -LEFT JOIN family_document ON family_document.import_id = slug.family_document_import_id -LEFT JOIN family ON family.import_id = family_document.family_import_id -LEFT JOIN family_corpus ON family_corpus.family_import_id = family.import_id -LEFT JOIN corpus ON corpus.import_id = family_corpus.corpus_import_id -WHERE slug.name = '{slug_name}' -AND corpus.import_id IN ({allowed_corpora_ids}); + INNER JOIN family + ON slug.family_import_id = family.import_id + INNER JOIN family_corpus + ON family.import_id = family_corpus.family_import_id + INNER JOIN corpus + ON family_corpus.corpus_import_id = corpus.import_id +WHERE + slug.name = :slug_name + AND corpus.import_id = ANY(:allowed_corpora_ids) diff --git a/makefile-docker.defs b/makefile-docker.defs index 276a67f0..b41a9358 100644 --- a/makefile-docker.defs +++ b/makefile-docker.defs @@ -123,7 +123,7 @@ test_non_search: docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv -m 'not search' ${ARGS} test: - docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv ${ARGS} + docker compose -f docker-compose.yml -f docker-compose.dev.yml run --rm backend pytest -vvv tests ${ARGS} # ---------------------------------- # tasks diff --git a/pyproject.toml b/pyproject.toml index e5569fe6..849d3b4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "navigator_backend" -version = "1.19.14" +version = "1.19.15" description = "" authors = ["CPR-dev-team "] packages = [{ include = "app" }, { include = "tests" }]