Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
Merge pull request #27 from climatepolicyradar/feature/rnd-219-update-block-types-and-parser-output-methods-in-data-access
Browse files Browse the repository at this point in the history

Azure Data Model Experimental Updates
  • Loading branch information
THOR300 authored Aug 17, 2023
2 parents 91736ba + 090156e commit c3dfce4
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 19 deletions.
24 changes: 19 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ tqdm = "^4.64.1"
aws-error-utils = "^2.7.0"
pandas = "^1.5.3"
datasets = "^2.10.1"
langdetect = "^1.0.9"

[tool.poetry.group.dev.dependencies]
pre-commit = "^2.20.0"
Expand Down
114 changes: 100 additions & 14 deletions src/cpr_data_access/parser_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
import logging.config
from datetime import date
from enum import Enum
from typing import Optional, Sequence, Tuple, List

from typing import Optional, Sequence, Tuple, List, Union
from collections import Counter
from pydantic import BaseModel, AnyHttpUrl, Field, root_validator
from langdetect import DetectorFactory, LangDetectException
from langdetect import detect

logger = logging.getLogger(__name__)

Expand All @@ -27,8 +29,15 @@ class BlockType(str, Enum):
TABLE = "Table"
FIGURE = "Figure"
INFERRED = "Inferred from gaps"
AMBIGUOUS = "Ambiguous" # TODO: remove this when OCRProcessor._infer_block_type is implemented
# TODO: remove this when OCRProcessor._infer_block_type is implemented
AMBIGUOUS = "Ambiguous"
GOOGLE_BLOCK = "Google Text Block"
PAGE_HEADER = "pageHeader"
PAGE_FOOTER = "pageFooter"
TITLE_LOWER_CASE = "title"
SECTION_HEADING = "sectionHeading"
PAGE_NUMBER = "pageNumber"
DOCUMENT_HEADER = "Document Header"


class TextBlock(BaseModel):
Expand Down Expand Up @@ -157,44 +166,121 @@ class ParserOutput(BaseModel):

@root_validator
def check_html_pdf_metadata(cls, values):
    """
    Validate the relationship between content-type and the data that is set.

    Check that html_data is set if content_type is HTML, or pdf_data is set if
    content_type is PDF.

    Check that if the content-type is not HTML or PDF, then html_data and
    pdf_data are both null.

    :raises ValueError: if the data attached does not match the content type.
    :return: the validated values dict (pydantic root-validator contract).
    """
    content_type = values["document_content_type"]

    # HTML documents must carry parsed HTML data.
    if content_type == CONTENT_TYPE_HTML and values["html_data"] is None:
        raise ValueError("html_data must be set for HTML documents")

    # PDF documents must carry parsed PDF data.
    if content_type == CONTENT_TYPE_PDF and values["pdf_data"] is None:
        raise ValueError("pdf_data must be set for PDF documents")

    # Any other content type (including None) must carry no parsed data at all.
    if content_type not in {
        CONTENT_TYPE_HTML,
        CONTENT_TYPE_PDF,
    } and (values["html_data"] is not None or values["pdf_data"] is not None):
        raise ValueError(
            "html_data and pdf_data must be null for documents with no content type."
        )

    return values

@property
def text_blocks(self) -> Sequence[TextBlock]:
    """
    Return the text blocks in the document.

    These could differ in format depending on the content type.

    :return: Sequence[TextBlock] — empty when no data is attached for the
        document's content type.
    """
    if self.document_content_type == CONTENT_TYPE_HTML:
        html_data: Union[HTMLData, None] = self.html_data
        return html_data.text_blocks if html_data else []
    elif self.document_content_type == CONTENT_TYPE_PDF:
        pdf_data: Union[PDFData, None] = self.pdf_data
        return pdf_data.text_blocks if pdf_data else []
    return []

def to_string(self) -> str:  # type: ignore
    """Return the text blocks in the parser output as a string"""
    # Strip each block's own rendering, then join with single spaces.
    stripped_blocks = (block.to_string().strip() for block in self.text_blocks)
    return " ".join(stripped_blocks)

def detect_and_set_languages(self) -> "ParserOutput":
    """
    Detect language of the text and set the language attribute.

    Return an instance of ParserOutput with the language attribute set. Assumes
    that a document only has one language.
    """
    if self.document_content_type != CONTENT_TYPE_HTML:
        logger.warning(
            "Language detection should not be required for non-HTML documents, "
            "but it has been run on one. This will overwrite any document "
            "languages detected via other means, e.g. OCR. "
        )

    # langdetect is not deterministic unless seeded.
    DetectorFactory.seed = 0

    if not self.text_blocks:
        return self

    try:
        language = detect(self.to_string())
    except LangDetectException:
        logger.warning(
            "Language detection failed for document with id %s",
            self.document_id,
        )
        language = None

    self.languages = [language] if language else []
    for block in self.text_blocks:
        block.language = language

    return self

def set_document_languages_from_text_blocks(
    self, min_language_proportion: float = 0.4
):
    """
    Store the document languages attribute as part of the object.

    Done by getting all languages with proportion above `min_language_proportion`.

    :attribute min_language_proportion: Minimum proportion of text blocks in a
        language for it to be considered a language of the document.
    """
    block_languages = [block.language for block in self.text_blocks]

    # No detected language on any block (or no blocks at all): store None.
    if not any(lang is not None for lang in block_languages):
        self.languages = None
        return self

    counts = Counter(lang for lang in block_languages if lang is not None)
    # Proportion is taken over ALL blocks, including those with no language.
    total_blocks = len(block_languages)
    self.languages = [
        lang
        for lang, count in counts.items()
        if count / total_blocks > min_language_proportion
    ]

    return self
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pytest
Expand Down Expand Up @@ -32,3 +33,10 @@ def s3_client():
s3_client.create_bucket(Bucket="empty-bucket")

yield s3_client


@pytest.fixture()
def parser_output_json() -> dict:
    """A dictionary representation of a parser output"""
    fixture_path = Path("tests/test_data/valid/test_pdf.json")
    return json.loads(fixture_path.read_text())
84 changes: 84 additions & 0 deletions tests/test_parser_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import unittest

import pydantic

from cpr_data_access.parser_models import (
ParserOutput,
CONTENT_TYPE_PDF,
CONTENT_TYPE_HTML,
)


def test_parser_output_object(parser_output_json):
    """
    Test that we correctly instantiate the parser output object.

    Also test the methods on the parser output object.
    """

    # Instantiate the parser output object — raises ValidationError on failure.
    ParserOutput.parse_obj(parser_output_json)

    # Test the optional fields.
    # NOTE(review): dict.copy() is shallow, so these variants share nested
    # objects with parser_output_json — fine here because only top-level keys
    # are overwritten.
    parser_output_empty_fields = parser_output_json.copy()
    parser_output_empty_fields["document_metadata"] = {}
    parser_output_empty_fields["document_cdn_object"] = None
    parser_output_empty_fields["document_md5_sum"] = None

    ParserOutput.parse_obj(parser_output_empty_fields)

    # Test the check html pdf metadata method:
    # a PDF document with no pdf_data must be rejected by the root validator.
    parser_output_no_pdf_data = parser_output_json.copy()
    parser_output_no_pdf_data["pdf_data"] = None
    parser_output_no_pdf_data["document_content_type"] = CONTENT_TYPE_PDF

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_no_pdf_data)
    assert "pdf_data must be set for PDF documents" in str(context.exception)

    # An HTML document with no html_data must likewise be rejected.
    parser_output_no_html_data = parser_output_json.copy()
    parser_output_no_html_data["html_data"] = None
    parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_no_html_data)
    assert "html_data must be set for HTML documents" in str(context.exception)

    # No content type at all: any attached data must be rejected.
    parser_output_no_content_type = parser_output_json.copy()
    # PDF data is set as the default
    parser_output_no_content_type["document_content_type"] = None

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_no_content_type)
    assert (
        "html_data and pdf_data must be null for documents with no content type."
    ) in str(context.exception)

    # An unrecognised content type behaves the same as no content type.
    parser_output_not_known_content_type = parser_output_json.copy()
    # PDF data is set as the default
    parser_output_not_known_content_type["document_content_type"] = "not_known"

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_not_known_content_type)
    assert (
        "html_data and pdf_data must be null for documents with no content type."
    ) in str(context.exception)

    # Test the text blocks property: populated with data, empty without.
    assert ParserOutput.parse_obj(parser_output_json).text_blocks != []
    parser_output_no_data = parser_output_json.copy()
    parser_output_no_data["pdf_data"] = None
    parser_output_no_data["document_content_type"] = None
    assert ParserOutput.parse_obj(parser_output_no_data).text_blocks == []

    # Test the to string method: non-empty with data, empty string without.
    assert ParserOutput.parse_obj(parser_output_json).to_string() != ""
    assert ParserOutput.parse_obj(parser_output_no_data).to_string() == ""

0 comments on commit c3dfce4

Please sign in to comment.