
Merge pull request #33 from climatepolicyradar/feature/rnd-279-add-data-access-library-to-the-pipeline-stages-for-type

Use DAL across pipeline components
THOR300 authored Aug 22, 2023
2 parents c3dfce4 + 51c5e6e commit 591171d
Showing 11 changed files with 375 additions and 45 deletions.
10 changes: 8 additions & 2 deletions src/cpr_data_access/models.py
@@ -33,9 +33,11 @@
import cpr_data_access.data_adaptors as adaptors
from cpr_data_access.parser_models import (
ParserOutput,
BlockType,
)
from cpr_data_access.pipeline_general_models import (
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
BlockType,
)

LOGGER = logging.getLogger(__name__)
@@ -46,7 +48,11 @@
def _load_and_validate_metadata_csv(
metadata_csv_path: Path, target_model: type[AnyDocument]
) -> pd.DataFrame:
"""Load a metadata CSV, raising a ValueError if it does not exist or doesn't have the expected columns."""
"""
Load a metadata CSV
Raise a ValueError if it does not exist or doesn't have the expected columns.
"""
if not metadata_csv_path.exists():
raise ValueError(f"metadata_csv_path {metadata_csv_path} does not exist")

107 changes: 82 additions & 25 deletions src/cpr_data_access/parser_models.py
@@ -1,19 +1,20 @@
"""A copy of src/base.py from navigator-document-parser, with methods that rely on external libraries removed. These may be duplicated in models.py, but the intention is that these stay in sync with the data pipeline so we can easily update the pipeline should we decide to use these new models."""

import logging
import logging.config
from datetime import date
from enum import Enum
from typing import Optional, Sequence, Tuple, List, Union
from typing import Optional, Sequence, Tuple, List, Union, Mapping, Any
from collections import Counter
from pydantic import BaseModel, AnyHttpUrl, Field, root_validator
from langdetect import DetectorFactory, LangDetectException
from langdetect import detect

logger = logging.getLogger(__name__)
from cpr_data_access.pipeline_general_models import (
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
BackendDocument,
)

CONTENT_TYPE_HTML = "text/html"
CONTENT_TYPE_PDF = "application/pdf"
logger = logging.getLogger(__name__)


class BlockType(str, Enum):
@@ -44,11 +45,11 @@ class TextBlock(BaseModel):
"""
Base class for a text block.
:attribute text: list of text lines contained in the text block
:attribute text_block_id: unique identifier for the text block
:attribute language: language of the text block. 2-letter ISO code, optional.
:attribute type: predicted type of the text block
:attribute type_confidence: confidence score of the text block being of the predicted type
:attribute text: list of text lines contained in the text block :attribute
text_block_id: unique identifier for the text block :attribute language: language
of the text block. 2-letter ISO code, optional. :attribute type: predicted type of
the text block :attribute type_confidence: confidence score of the text block
being of the predicted type
"""

text: List[str]
@@ -60,7 +61,7 @@ class TextBlock(BaseModel):
type_confidence: float = Field(ge=0, le=1)

def to_string(self) -> str:
"""Returns the lines in a text block as a string with the lines separated by spaces."""
"""Returns lines in a text block separated by spaces as a string."""

return " ".join([line.strip() for line in self.text])

@@ -69,7 +70,8 @@ class HTMLTextBlock(TextBlock):
"""
Text block parsed from an HTML document.
Type is set to "Text" with a confidence of 1.0 by default, as we do not predict types for text blocks parsed from HTML.
Type is set to "Text" with a confidence of 1.0 by default, as we do not predict
types for text blocks parsed from HTML.
"""

type: BlockType = BlockType.TEXT
@@ -80,19 +82,20 @@ class PDFTextBlock(TextBlock):
"""
Text block parsed from a PDF document.
Stores the text and positional information for a single text block extracted from a document.
Stores the text and positional information for a single text block extracted from
a document.
:attribute coords: list of coordinates of the vertices defining the boundary of the text block.
Each coordinate is a tuple in the format (x, y). (0, 0) is at the top left corner of
the page, and the positive x- and y- directions are right and down.
:attribute page_number: page number of the page containing the text block.
:attribute coords: list of coordinates of the vertices defining the boundary of
the text block. Each coordinate is a tuple in the format (x, y). (0, 0) is at the
top left corner of the page, and the positive x- and y- directions are right and
down. :attribute page_number: page number of the page containing the text block.
"""

coords: List[Tuple[float, float]]
page_number: int = Field(ge=0)

def to_string(self) -> str:
"""Returns the lines in a text block as a string with the lines separated by spaces."""
"""Returns lines in a text block separated by spaces as a string."""

return " ".join([line.strip() for line in self.text])

@@ -101,7 +104,7 @@ class ParserInput(BaseModel):
"""Base class for input to a parser."""

document_id: str
document_metadata: dict
document_metadata: BackendDocument
document_name: str
document_description: str
document_source_url: Optional[AnyHttpUrl]
@@ -110,6 +113,20 @@
document_md5_sum: Optional[str]
document_slug: str

def to_json(self) -> Mapping[str, Any]:
"""Output a JSON serialising friendly dict representing this model"""
return {
"document_name": self.document_name,
"document_description": self.document_description,
"document_id": self.document_id,
"document_source_url": self.document_source_url,
"document_cdn_object": self.document_cdn_object,
"document_content_type": self.document_content_type,
"document_md5_sum": self.document_md5_sum,
"document_metadata": self.document_metadata.to_json(),
"document_slug": self.document_slug,
}

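For illustration only (not part of this diff): with document_metadata now typed as BackendDocument instead of a plain dict, a pipeline stage might build and serialise a ParserInput roughly as sketched below. All field values are invented.

from cpr_data_access.parser_models import ParserInput
from cpr_data_access.pipeline_general_models import BackendDocument

# Invented example values; real ones come from the backend / RDS state.
backend_doc = BackendDocument(
    name="Example Policy",
    description="An example document",
    import_id="CCLW.executive.1.0",
    family_import_id="CCLW.executive.1",
    slug="example-policy",
    publication_ts="2021-01-01T00:00:00+00:00",
    source_url=None,
    download_url=None,
    type="Law",
    source="CCLW",
    category="Executive",
    geography="GBR",
    languages=["en"],
    metadata={},
)

parser_input = ParserInput(
    document_id=backend_doc.import_id,
    document_metadata=backend_doc,  # previously a plain dict
    document_name=backend_doc.name,
    document_description=backend_doc.description,
    document_source_url=None,
    document_cdn_object=None,
    document_content_type="application/pdf",
    document_md5_sum=None,
    document_slug=backend_doc.slug,
)

# to_json() returns a JSON-serialisable dict, with the nested
# BackendDocument serialised via its own to_json().
payload = parser_input.to_json()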

class HTMLData(BaseModel):
"""Set of metadata specific to HTML documents."""
@@ -135,10 +152,10 @@ class PDFData(BaseModel):
"""
Set of metadata unique to PDF documents.
:attribute pages: List of pages contained in the document
:attribute filename: Name of the PDF file, without extension
:attribute md5sum: md5sum of PDF content
:attribute language: list of 2-letter ISO language codes, optional. If null, the OCR processor didn't support language detection
:attribute pages: List of pages contained in the document :attribute filename:
Name of the PDF file, without extension :attribute md5sum: md5sum of PDF content
:attribute language: list of 2-letter ISO language codes, optional. If null,
the OCR processor didn't support language detection
"""

page_metadata: Sequence[PDFPageMetadata]
@@ -150,7 +167,7 @@ class ParserOutput(BaseModel):
"""Base class for an output to a parser."""

document_id: str
document_metadata: dict
document_metadata: BackendDocument
document_name: str
document_description: str
document_source_url: Optional[AnyHttpUrl]
@@ -197,6 +214,14 @@ def check_html_pdf_metadata(cls, values):

return values

def get_text_blocks(self, including_invalid_html=False):
"""A method for getting text blocks with the option to include invalid html."""
if self.document_content_type == CONTENT_TYPE_HTML and self.html_data:
if not including_invalid_html and not self.html_data.has_valid_text:
return []
else:
return self.text_blocks

@property
def text_blocks(self) -> Sequence[TextBlock]:
"""
@@ -284,3 +309,35 @@ def set_document_languages_from_text_blocks(
]

return self

def vertically_flip_text_block_coords(self) -> "ParserOutput":
"""
Flips the coordinates of all PDF text blocks vertically.
Acts in-place on the coordinates in the ParserOutput object.
"""

if self.pdf_data is None:
return self

page_height_map = {
page.page_number: page.dimensions[1] for page in self.pdf_data.page_metadata
}

for text_block in self.pdf_data.text_blocks:
if text_block.coords is not None and text_block.page_number is not None:
text_block.coords = [
(x, page_height_map[text_block.page_number] - y)
for x, y in text_block.coords
]

# flip top and bottom so y values are still increasing as you go
# through the coordinates list
text_block.coords = [
text_block.coords[3],
text_block.coords[2],
text_block.coords[1],
text_block.coords[0],
]

return self
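
A worked example of the flip (illustrative only, not part of the commit), assuming a page 842 units high and a block whose corners are listed clockwise from the top-left:

page_height = 842
coords = [(10.0, 100.0), (200.0, 100.0), (200.0, 150.0), (10.0, 150.0)]

# Step 1: flip each y so the origin moves from top-left to bottom-left.
flipped = [(x, page_height - y) for x, y in coords]
# -> [(10.0, 742.0), (200.0, 742.0), (200.0, 692.0), (10.0, 692.0)]

# Step 2: reverse the vertex order so y still increases through the list.
flipped = [flipped[3], flipped[2], flipped[1], flipped[0]]
# -> [(10.0, 692.0), (200.0, 692.0), (200.0, 742.0), (10.0, 742.0)]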
107 changes: 107 additions & 0 deletions src/cpr_data_access/pipeline_general_models.py
@@ -0,0 +1,107 @@
from datetime import datetime
from enum import Enum
from typing import Mapping, Any, List, Optional, Sequence, Union

from pydantic import BaseModel, root_validator

Json = dict[str, Any]

CONTENT_TYPE_HTML = "text/html"
CONTENT_TYPE_DOCX = (
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
CONTENT_TYPE_PDF = "application/pdf"


class BackendDocument(BaseModel):
"""
A representation of all information expected to be provided for a document.
This class comprises direct information describing a document, along
with all metadata values that should be associated with that document.
"""

name: str
description: str
import_id: str
family_import_id: str
slug: str
publication_ts: datetime
date: Optional[str] = None # Set on import by a validator
source_url: Optional[str]
download_url: Optional[str]

type: str
source: str
category: str
geography: str
languages: Sequence[str]

metadata: Json

@root_validator
def convert_publication_ts_to_date(cls, values):
"""
Convert publication_ts to a datetime string.
This is necessary as OpenSearch expects a date object.
"""

values["date"] = values["publication_ts"].strftime("%d/%m/%Y")

return values

def to_json(self) -> Mapping[str, Any]:
"""Output a JSON serialising friendly dict representing this model."""
json_dict = self.dict()
json_dict["publication_ts"] = self.publication_ts.isoformat()
return json_dict


class InputData(BaseModel):
"""Expected input data containing RDS state."""

documents: Mapping[str, BackendDocument]


class UpdateTypes(str, Enum):
"""Document types supported by the backend API."""

NAME = "name"
DESCRIPTION = "description"
# IMPORT_ID = "import_id"
# SLUG = "slug"
# PUBLICATION_TS = "publication_ts"
SOURCE_URL = "source_url"
# TYPE = "type"
# SOURCE = "source"
# CATEGORY = "category"
# GEOGRAPHY = "geography"
# LANGUAGES = "languages"
# DOCUMENT_STATUS = "document_status"
METADATA = "metadata"


class Update(BaseModel):
"""Results of comparing db state data against the s3 data to identify updates."""

s3_value: Optional[Union[str, datetime, dict]]
db_value: Union[str, datetime, dict]
type: UpdateTypes


class PipelineUpdates(BaseModel):
"""
Expected input data containing document updates and new documents.
This is utilized by the ingest stage of the pipeline.
"""

new_documents: List[BackendDocument]
updated_documents: dict[str, List[Update]]


class ExecutionData(BaseModel):
"""Data unique to a step functions execution that is required at later stages."""

input_dir_path: str
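
An illustrative sketch (not in this commit) of how the ingest stage might record a detected change with these models; values are invented.

from cpr_data_access.pipeline_general_models import PipelineUpdates, Update, UpdateTypes

# A document whose name differs between the database state and the s3 state.
name_update = Update(
    type=UpdateTypes.NAME,
    db_value="Climate Act 2021 (amended)",
    s3_value="Climate Act 2021",
)

updates = PipelineUpdates(
    new_documents=[],
    updated_documents={"CCLW.executive.1.0": [name_update]},
)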
31 changes: 30 additions & 1 deletion tests/conftest.py
@@ -36,7 +36,36 @@ def s3_client():


@pytest.fixture()
def parser_output_json() -> dict:
def parser_output_json_pdf() -> dict:
"""A dictionary representation of a parser output"""
with open("tests/test_data/valid/test_pdf.json") as f:
return json.load(f)


@pytest.fixture()
def parser_output_json_html() -> dict:
"""A dictionary representation of a parser output"""
with open("tests/test_data/valid/test_html.json") as f:
return json.load(f)


@pytest.fixture()
def backend_document_json() -> dict:
"""A dictionary representation of a backend document"""
return {
"name": "test_name",
"description": "test_description",
"import_id": "test_import_id",
"family_import_id": "test_family_import_id",
"slug": "test_slug",
"publication_ts": "2021-01-01T00:00:00+00:00",
"date": "01/01/2021",
"source_url": "test_source_url",
"download_url": "test_download_url",
"type": "test_type",
"source": "test_source",
"category": "test_category",
"geography": "test_geography",
"languages": ["test_language"],
"metadata": {"test_metadata": "test_value"},
}
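
A minimal sketch (not part of the commit) of how a test might consume this fixture with the new model; parse_obj validates the raw dict and the root validator derives the date field.

from cpr_data_access.pipeline_general_models import BackendDocument

def test_backend_document_parses(backend_document_json):
    doc = BackendDocument.parse_obj(backend_document_json)
    # date is derived from publication_ts by the root validator.
    assert doc.date == "01/01/2021"
    # to_json() serialises publication_ts back to an ISO string.
    assert doc.to_json()["publication_ts"] == "2021-01-01T00:00:00+00:00"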
21 changes: 21 additions & 0 deletions tests/test_data/invalid/test_html.json
@@ -8,6 +8,27 @@
"languages": [
"en"
],
"document_metadata": {
"name": "test_pdf",
"description": "test_pdf_description",
"import_id": "CCLW.executive.1003.0",
"family_import_id": "CCLW.executive.1003",
"slug": "test_pdf",
"publication_ts": "2022-10-25 12:43:00.869045",
"source_url": "https://cdn.climatepolicyradar.org/EUR/2013/EUR-2013-01-01-Overview+of+CAP+Reform+2014-2020_6237180d8c443d72c06c9167019ca177.pdf",
"download_url": "https://cdn.climatepolicyradar.org/EUR/2013/EUR-2013-01-01-Overview+of+CAP+Reform+2014-2020_6237180d8c443d72c06c9167019ca177.pdf",
"geography": "test_geo",
"category": "test_category",
"source": "test_source",
"type": "test_type",
"sectors": ["sector1", "sector2"],
"languages": [
"en"
],
"metadata": {
"test_key": "test_value"
}
},
"translated": false,
"document_slug": "YYY",
"document_content_type": "text/html",