
Merge pull request #33 from climatepolicyradar/feature/rnd-279-add-data-access-library-to-the-pipeline-stages-for-type

Use DAL across pipeline components
THOR300 authored Aug 22, 2023
2 parents c3dfce4 + 51c5e6e commit 591171d
Showing 11 changed files with 375 additions and 45 deletions.
10 changes: 8 additions & 2 deletions src/cpr_data_access/models.py
@@ -33,9 +33,11 @@
import cpr_data_access.data_adaptors as adaptors
from cpr_data_access.parser_models import (
ParserOutput,
BlockType,
)
from cpr_data_access.pipeline_general_models import (
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
BlockType,
)

LOGGER = logging.getLogger(__name__)
@@ -46,7 +48,11 @@
def _load_and_validate_metadata_csv(
metadata_csv_path: Path, target_model: type[AnyDocument]
) -> pd.DataFrame:
"""Load a metadata CSV, raising a ValueError if it does not exist or doesn't have the expected columns."""
"""
Load a metadata CSV
Raise a ValueError if it does not exist or doesn't have the expected columns.
"""
if not metadata_csv_path.exists():
raise ValueError(f"metadata_csv_path {metadata_csv_path} does not exist")

107 changes: 82 additions & 25 deletions src/cpr_data_access/parser_models.py
@@ -1,19 +1,20 @@
"""A copy of src/base.py from navigator-document-parser, with methods that rely on external libraries removed. These may be duplicated in models.py, but the intention is that these stay in sync with the data pipeline so we can easily update the pipeline should we decide to use these new models."""

import logging
import logging.config
from datetime import date
from enum import Enum
from typing import Optional, Sequence, Tuple, List, Union
from typing import Optional, Sequence, Tuple, List, Union, Mapping, Any
from collections import Counter
from pydantic import BaseModel, AnyHttpUrl, Field, root_validator
from langdetect import DetectorFactory, LangDetectException
from langdetect import detect

logger = logging.getLogger(__name__)
from cpr_data_access.pipeline_general_models import (
CONTENT_TYPE_HTML,
CONTENT_TYPE_PDF,
BackendDocument,
)

CONTENT_TYPE_HTML = "text/html"
CONTENT_TYPE_PDF = "application/pdf"
logger = logging.getLogger(__name__)


class BlockType(str, Enum):
@@ -44,11 +45,11 @@ class TextBlock(BaseModel):
"""
Base class for a text block.
:attribute text: list of text lines contained in the text block
:attribute text_block_id: unique identifier for the text block
:attribute language: language of the text block. 2-letter ISO code, optional.
:attribute type: predicted type of the text block
:attribute type_confidence: confidence score of the text block being of the predicted type
:attribute text: list of text lines contained in the text block :attribute
text_block_id: unique identifier for the text block :attribute language: language
of the text block. 2-letter ISO code, optional. :attribute type: predicted type of
the text block :attribute type_confidence: confidence score of the text block
being of the predicted type
"""

text: List[str]
@@ -60,7 +61,7 @@ class TextBlock(BaseModel):
type_confidence: float = Field(ge=0, le=1)

def to_string(self) -> str:
"""Returns the lines in a text block as a string with the lines separated by spaces."""
"""Returns lines in a text block separated by spaces as a string."""

return " ".join([line.strip() for line in self.text])

@@ -69,7 +70,8 @@ class HTMLTextBlock(TextBlock):
"""
Text block parsed from an HTML document.
Type is set to "Text" with a confidence of 1.0 by default, as we do not predict types for text blocks parsed from HTML.
Type is set to "Text" with a confidence of 1.0 by default, as we do not predict
types for text blocks parsed from HTML.
"""

type: BlockType = BlockType.TEXT
@@ -80,19 +82,20 @@ class PDFTextBlock(TextBlock):
"""
Text block parsed from a PDF document.
Stores the text and positional information for a single text block extracted from a document.
Stores the text and positional information for a single text block extracted from
a document.
:attribute coords: list of coordinates of the vertices defining the boundary of the text block.
Each coordinate is a tuple in the format (x, y). (0, 0) is at the top left corner of
the page, and the positive x- and y- directions are right and down.
:attribute page_number: page number of the page containing the text block.
:attribute coords: list of coordinates of the vertices defining the boundary of
the text block. Each coordinate is a tuple in the format (x, y). (0, 0) is at the
top left corner of the page, and the positive x- and y- directions are right and
down. :attribute page_number: page number of the page containing the text block.
"""

coords: List[Tuple[float, float]]
page_number: int = Field(ge=0)

def to_string(self) -> str:
"""Returns the lines in a text block as a string with the lines separated by spaces."""
"""Returns lines in a text block separated by spaces as a string."""

return " ".join([line.strip() for line in self.text])

@@ -101,7 +104,7 @@ class ParserInput(BaseModel):
"""Base class for input to a parser."""

document_id: str
document_metadata: dict
document_metadata: BackendDocument
document_name: str
document_description: str
document_source_url: Optional[AnyHttpUrl]
@@ -110,6 +113,20 @@
document_md5_sum: Optional[str]
document_slug: str

def to_json(self) -> Mapping[str, Any]:
"""Output a JSON serialising friendly dict representing this model"""
return {
"document_name": self.document_name,
"document_description": self.document_description,
"document_id": self.document_id,
"document_source_url": self.document_source_url,
"document_cdn_object": self.document_cdn_object,
"document_content_type": self.document_content_type,
"document_md5_sum": self.document_md5_sum,
"document_metadata": self.document_metadata.to_json(),
"document_slug": self.document_slug,
}

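For illustration only (not part of this diff): with document_metadata now typed as BackendDocument instead of a plain dict, a pipeline stage might build and serialise a ParserInput roughly as sketched below. All field values are invented.

from cpr_data_access.parser_models import ParserInput
from cpr_data_access.pipeline_general_models import BackendDocument

# Invented example values; real ones come from the backend / RDS state.
backend_doc = BackendDocument(
    name="Example Policy",
    description="An example document",
    import_id="CCLW.executive.1.0",
    family_import_id="CCLW.executive.1",
    slug="example-policy",
    publication_ts="2021-01-01T00:00:00+00:00",
    source_url=None,
    download_url=None,
    type="Law",
    source="CCLW",
    category="Executive",
    geography="GBR",
    languages=["en"],
    metadata={},
)

parser_input = ParserInput(
    document_id=backend_doc.import_id,
    document_metadata=backend_doc,  # previously a plain dict
    document_name=backend_doc.name,
    document_description=backend_doc.description,
    document_source_url=None,
    document_cdn_object=None,
    document_content_type="application/pdf",
    document_md5_sum=None,
    document_slug=backend_doc.slug,
)

# to_json() returns a JSON-serialisable dict, with the nested
# BackendDocument serialised via its own to_json().
payload = parser_input.to_json()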

class HTMLData(BaseModel):
"""Set of metadata specific to HTML documents."""
@@ -135,10 +152,10 @@ class PDFData(BaseModel):
"""
Set of metadata unique to PDF documents.
:attribute pages: List of pages contained in the document
:attribute filename: Name of the PDF file, without extension
:attribute md5sum: md5sum of PDF content
:attribute language: list of 2-letter ISO language codes, optional. If null, the OCR processor didn't support language detection
:attribute pages: List of pages contained in the document :attribute filename:
Name of the PDF file, without extension :attribute md5sum: md5sum of PDF content
:attribute language: list of 2-letter ISO language codes, optional. If null,
the OCR processor didn't support language detection
"""

page_metadata: Sequence[PDFPageMetadata]
@@ -150,7 +167,7 @@ class ParserOutput(BaseModel):
"""Base class for an output to a parser."""

document_id: str
document_metadata: dict
document_metadata: BackendDocument
document_name: str
document_description: str
document_source_url: Optional[AnyHttpUrl]
@@ -197,6 +214,14 @@ def check_html_pdf_metadata(cls, values):

return values

def get_text_blocks(self, including_invalid_html=False):
"""A method for getting text blocks with the option to include invalid html."""
if self.document_content_type == CONTENT_TYPE_HTML and self.html_data:
if not including_invalid_html and not self.html_data.has_valid_text:
return []
else:
return self.text_blocks

@property
def text_blocks(self) -> Sequence[TextBlock]:
"""
@@ -284,3 +309,35 @@ def set_document_languages_from_text_blocks(
]

return self

def vertically_flip_text_block_coords(self) -> "ParserOutput":
"""
Flips the coordinates of all PDF text blocks vertically.
Acts in-place on the coordinates in the ParserOutput object.
"""

if self.pdf_data is None:
return self

page_height_map = {
page.page_number: page.dimensions[1] for page in self.pdf_data.page_metadata
}

for text_block in self.pdf_data.text_blocks:
if text_block.coords is not None and text_block.page_number is not None:
text_block.coords = [
(x, page_height_map[text_block.page_number] - y)
for x, y in text_block.coords
]

# flip top and bottom so y values are still increasing as you go
# through the coordinates list
text_block.coords = [
text_block.coords[3],
text_block.coords[2],
text_block.coords[1],
text_block.coords[0],
]

return self
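
A worked example of the flip (illustrative only, not part of the commit), assuming a page 842 units high and a block whose corners are listed clockwise from the top-left:

page_height = 842
coords = [(10.0, 100.0), (200.0, 100.0), (200.0, 150.0), (10.0, 150.0)]

# Step 1: flip each y so the origin moves from top-left to bottom-left.
flipped = [(x, page_height - y) for x, y in coords]
# -> [(10.0, 742.0), (200.0, 742.0), (200.0, 692.0), (10.0, 692.0)]

# Step 2: reverse the vertex order so y still increases through the list.
flipped = [flipped[3], flipped[2], flipped[1], flipped[0]]
# -> [(10.0, 692.0), (200.0, 692.0), (200.0, 742.0), (10.0, 742.0)]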
107 changes: 107 additions & 0 deletions src/cpr_data_access/pipeline_general_models.py
@@ -0,0 +1,107 @@
from datetime import datetime
from enum import Enum
from typing import Mapping, Any, List, Optional, Sequence, Union

from pydantic import BaseModel, root_validator

Json = dict[str, Any]

CONTENT_TYPE_HTML = "text/html"
CONTENT_TYPE_DOCX = (
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
CONTENT_TYPE_PDF = "application/pdf"


class BackendDocument(BaseModel):
"""
A representation of all information expected to be provided for a document.
This class comprises direct information describing a document, along
with all metadata values that should be associated with that document.
"""

name: str
description: str
import_id: str
family_import_id: str
slug: str
publication_ts: datetime
date: Optional[str] = None # Set on import by a validator
source_url: Optional[str]
download_url: Optional[str]

type: str
source: str
category: str
geography: str
languages: Sequence[str]

metadata: Json

@root_validator
def convert_publication_ts_to_date(cls, values):
"""
Convert publication_ts to a datetime string.
This is necessary as OpenSearch expects a date object.
"""

values["date"] = values["publication_ts"].strftime("%d/%m/%Y")

return values

def to_json(self) -> Mapping[str, Any]:
"""Output a JSON serialising friendly dict representing this model."""
json_dict = self.dict()
json_dict["publication_ts"] = self.publication_ts.isoformat()
return json_dict


class InputData(BaseModel):
"""Expected input data containing RDS state."""

documents: Mapping[str, BackendDocument]


class UpdateTypes(str, Enum):
"""Document types supported by the backend API."""

NAME = "name"
DESCRIPTION = "description"
# IMPORT_ID = "import_id"
# SLUG = "slug"
# PUBLICATION_TS = "publication_ts"
SOURCE_URL = "source_url"
# TYPE = "type"
# SOURCE = "source"
# CATEGORY = "category"
# GEOGRAPHY = "geography"
# LANGUAGES = "languages"
# DOCUMENT_STATUS = "document_status"
METADATA = "metadata"


class Update(BaseModel):
"""Results of comparing db state data against the s3 data to identify updates."""

s3_value: Optional[Union[str, datetime, dict]]
db_value: Union[str, datetime, dict]
type: UpdateTypes


class PipelineUpdates(BaseModel):
"""
Expected input data containing document updates and new documents.
This is utilized by the ingest stage of the pipeline.
"""

new_documents: List[BackendDocument]
updated_documents: dict[str, List[Update]]


class ExecutionData(BaseModel):
"""Data unique to a step functions execution that is required at later stages."""

input_dir_path: str
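
An illustrative sketch (not in this commit) of how the ingest stage might record a detected change with these models; values are invented.

from cpr_data_access.pipeline_general_models import PipelineUpdates, Update, UpdateTypes

# A document whose name differs between the database state and the s3 state.
name_update = Update(
    type=UpdateTypes.NAME,
    db_value="Climate Act 2021 (amended)",
    s3_value="Climate Act 2021",
)

updates = PipelineUpdates(
    new_documents=[],
    updated_documents={"CCLW.executive.1.0": [name_update]},
)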
31 changes: 30 additions & 1 deletion tests/conftest.py
@@ -36,7 +36,36 @@ def s3_client():


@pytest.fixture()
def parser_output_json() -> dict:
def parser_output_json_pdf() -> dict:
"""A dictionary representation of a parser output"""
with open("tests/test_data/valid/test_pdf.json") as f:
return json.load(f)


@pytest.fixture()
def parser_output_json_html() -> dict:
"""A dictionary representation of a parser output"""
with open("tests/test_data/valid/test_html.json") as f:
return json.load(f)


@pytest.fixture()
def backend_document_json() -> dict:
"""A dictionary representation of a backend document"""
return {
"name": "test_name",
"description": "test_description",
"import_id": "test_import_id",
"family_import_id": "test_family_import_id",
"slug": "test_slug",
"publication_ts": "2021-01-01T00:00:00+00:00",
"date": "01/01/2021",
"source_url": "test_source_url",
"download_url": "test_download_url",
"type": "test_type",
"source": "test_source",
"category": "test_category",
"geography": "test_geography",
"languages": ["test_language"],
"metadata": {"test_metadata": "test_value"},
}
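
A minimal sketch (not part of the commit) of how a test might consume this fixture with the new model; parse_obj validates the raw dict and the root validator derives the date field.

from cpr_data_access.pipeline_general_models import BackendDocument

def test_backend_document_parses(backend_document_json):
    doc = BackendDocument.parse_obj(backend_document_json)
    # date is derived from publication_ts by the root validator.
    assert doc.date == "01/01/2021"
    # to_json() serialises publication_ts back to an ISO string.
    assert doc.to_json()["publication_ts"] == "2021-01-01T00:00:00+00:00"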
21 changes: 21 additions & 0 deletions tests/test_data/invalid/test_html.json
@@ -8,6 +8,27 @@
"languages": [
"en"
],
"document_metadata": {
"name": "test_pdf",
"description": "test_pdf_description",
"import_id": "CCLW.executive.1003.0",
"family_import_id": "CCLW.executive.1003",
"slug": "test_pdf",
"publication_ts": "2022-10-25 12:43:00.869045",
"source_url": "https://cdn.climatepolicyradar.org/EUR/2013/EUR-2013-01-01-Overview+of+CAP+Reform+2014-2020_6237180d8c443d72c06c9167019ca177.pdf",
"download_url": "https://cdn.climatepolicyradar.org/EUR/2013/EUR-2013-01-01-Overview+of+CAP+Reform+2014-2020_6237180d8c443d72c06c9167019ca177.pdf",
"geography": "test_geo",
"category": "test_category",
"source": "test_source",
"type": "test_type",
"sectors": ["sector1", "sector2"],
"languages": [
"en"
],
"metadata": {
"test_key": "test_value"
}
},
"translated": false,
"document_slug": "YYY",
"document_content_type": "text/html",