Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
Merge pull request #27 from climatepolicyradar/feature/rnd-219-update-block-types-and-parser-output-methods-in-data-access
Browse files Browse the repository at this point in the history

Azure Data Model Experimental Updates
  • Loading branch information
THOR300 authored Aug 17, 2023
2 parents 91736ba + 090156e commit c3dfce4
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 19 deletions.
24 changes: 19 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ tqdm = "^4.64.1"
aws-error-utils = "^2.7.0"
pandas = "^1.5.3"
datasets = "^2.10.1"
langdetect = "^1.0.9"

[tool.poetry.group.dev.dependencies]
pre-commit = "^2.20.0"
Expand Down
114 changes: 100 additions & 14 deletions src/cpr_data_access/parser_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
import logging.config
from datetime import date
from enum import Enum
from typing import Optional, Sequence, Tuple, List

from typing import Optional, Sequence, Tuple, List, Union
from collections import Counter
from pydantic import BaseModel, AnyHttpUrl, Field, root_validator
from langdetect import DetectorFactory, LangDetectException
from langdetect import detect

logger = logging.getLogger(__name__)

Expand All @@ -27,8 +29,15 @@ class BlockType(str, Enum):
TABLE = "Table"
FIGURE = "Figure"
INFERRED = "Inferred from gaps"
AMBIGUOUS = "Ambiguous" # TODO: remove this when OCRProcessor._infer_block_type is implemented
# TODO: remove this when OCRProcessor._infer_block_type is implemented
AMBIGUOUS = "Ambiguous"
GOOGLE_BLOCK = "Google Text Block"
PAGE_HEADER = "pageHeader"
PAGE_FOOTER = "pageFooter"
TITLE_LOWER_CASE = "title"
SECTION_HEADING = "sectionHeading"
PAGE_NUMBER = "pageNumber"
DOCUMENT_HEADER = "Document Header"


class TextBlock(BaseModel):
Expand Down Expand Up @@ -157,44 +166,121 @@ class ParserOutput(BaseModel):

@root_validator
def check_html_pdf_metadata(cls, values):
    """
    Validate the relationship between content-type and the data that is set.

    Check that html_data is set if content_type is HTML, or pdf_data is set if
    content_type is PDF.

    Check that if the content-type is not HTML or PDF, then html_data and
    pdf_data are both null.

    :raises ValueError: if the data attached does not match the content type.
    :return: the validated values dict (pydantic root-validator contract).
    """
    content_type = values["document_content_type"]

    # HTML documents must carry parsed HTML data.
    if content_type == CONTENT_TYPE_HTML and values["html_data"] is None:
        raise ValueError("html_data must be set for HTML documents")

    # PDF documents must carry parsed PDF data.
    if content_type == CONTENT_TYPE_PDF and values["pdf_data"] is None:
        raise ValueError("pdf_data must be set for PDF documents")

    # Any other content type (including None) must carry no parsed data at all.
    if content_type not in {
        CONTENT_TYPE_HTML,
        CONTENT_TYPE_PDF,
    } and (values["html_data"] is not None or values["pdf_data"] is not None):
        raise ValueError(
            "html_data and pdf_data must be null for documents with no content type."
        )

    return values

@property
def text_blocks(self) -> Sequence[TextBlock]:
    """
    Return the text blocks in the document.

    These could differ in format depending on the content type.

    :return: Sequence[TextBlock] — empty when no data is attached for the
        document's content type.
    """
    if self.document_content_type == CONTENT_TYPE_HTML:
        html_data: Union[HTMLData, None] = self.html_data
        return html_data.text_blocks if html_data else []
    elif self.document_content_type == CONTENT_TYPE_PDF:
        pdf_data: Union[PDFData, None] = self.pdf_data
        return pdf_data.text_blocks if pdf_data else []
    return []

def to_string(self) -> str:  # type: ignore
    """Return the text blocks in the parser output as a string"""
    # Strip each block's own rendering, then join with single spaces.
    stripped_blocks = (block.to_string().strip() for block in self.text_blocks)
    return " ".join(stripped_blocks)

def detect_and_set_languages(self) -> "ParserOutput":
    """
    Detect language of the text and set the language attribute.

    Return an instance of ParserOutput with the language attribute set. Assumes
    that a document only has one language.
    """
    if self.document_content_type != CONTENT_TYPE_HTML:
        logger.warning(
            "Language detection should not be required for non-HTML documents, "
            "but it has been run on one. This will overwrite any document "
            "languages detected via other means, e.g. OCR. "
        )

    # langdetect is not deterministic unless seeded.
    DetectorFactory.seed = 0

    if not self.text_blocks:
        return self

    try:
        language = detect(self.to_string())
    except LangDetectException:
        logger.warning(
            "Language detection failed for document with id %s",
            self.document_id,
        )
        language = None

    self.languages = [language] if language else []
    for block in self.text_blocks:
        block.language = language

    return self

def set_document_languages_from_text_blocks(
    self, min_language_proportion: float = 0.4
):
    """
    Store the document languages attribute as part of the object.

    Done by getting all languages with proportion above `min_language_proportion`.

    :attribute min_language_proportion: Minimum proportion of text blocks in a
        language for it to be considered a language of the document.
    """
    block_languages = [block.language for block in self.text_blocks]

    # No detected language on any block (or no blocks at all): store None.
    if not any(lang is not None for lang in block_languages):
        self.languages = None
        return self

    counts = Counter(lang for lang in block_languages if lang is not None)
    # Proportion is taken over ALL blocks, including those with no language.
    total_blocks = len(block_languages)
    self.languages = [
        lang
        for lang, count in counts.items()
        if count / total_blocks > min_language_proportion
    ]

    return self
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pytest
Expand Down Expand Up @@ -32,3 +33,10 @@ def s3_client():
s3_client.create_bucket(Bucket="empty-bucket")

yield s3_client


@pytest.fixture()
def parser_output_json() -> dict:
    """A dictionary representation of a parser output"""
    fixture_path = Path("tests/test_data/valid/test_pdf.json")
    return json.loads(fixture_path.read_text())
84 changes: 84 additions & 0 deletions tests/test_parser_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import unittest

import pydantic

from cpr_data_access.parser_models import (
ParserOutput,
CONTENT_TYPE_PDF,
CONTENT_TYPE_HTML,
)


def test_parser_output_object(parser_output_json):
    """
    Test that we correctly instantiate the parser output object.

    Also test the methods on the parser output object.
    """

    # Instantiate the parser output object — raises ValidationError on failure.
    ParserOutput.parse_obj(parser_output_json)

    # Test the optional fields.
    # NOTE(review): dict.copy() is shallow, so these variants share nested
    # objects with parser_output_json — fine here because only top-level keys
    # are overwritten.
    parser_output_empty_fields = parser_output_json.copy()
    parser_output_empty_fields["document_metadata"] = {}
    parser_output_empty_fields["document_cdn_object"] = None
    parser_output_empty_fields["document_md5_sum"] = None

    ParserOutput.parse_obj(parser_output_empty_fields)

    # Test the check html pdf metadata method:
    # a PDF document with no pdf_data must be rejected by the root validator.
    parser_output_no_pdf_data = parser_output_json.copy()
    parser_output_no_pdf_data["pdf_data"] = None
    parser_output_no_pdf_data["document_content_type"] = CONTENT_TYPE_PDF

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_no_pdf_data)
    assert "pdf_data must be set for PDF documents" in str(context.exception)

    # An HTML document with no html_data must likewise be rejected.
    parser_output_no_html_data = parser_output_json.copy()
    parser_output_no_html_data["html_data"] = None
    parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_no_html_data)
    assert "html_data must be set for HTML documents" in str(context.exception)

    # No content type at all: any attached data must be rejected.
    parser_output_no_content_type = parser_output_json.copy()
    # PDF data is set as the default
    parser_output_no_content_type["document_content_type"] = None

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_no_content_type)
    assert (
        "html_data and pdf_data must be null for documents with no content type."
    ) in str(context.exception)

    # An unrecognised content type behaves the same as no content type.
    parser_output_not_known_content_type = parser_output_json.copy()
    # PDF data is set as the default
    parser_output_not_known_content_type["document_content_type"] = "not_known"

    with unittest.TestCase().assertRaises(
        pydantic.error_wrappers.ValidationError
    ) as context:
        ParserOutput.parse_obj(parser_output_not_known_content_type)
    assert (
        "html_data and pdf_data must be null for documents with no content type."
    ) in str(context.exception)

    # Test the text blocks property: populated with data, empty without.
    assert ParserOutput.parse_obj(parser_output_json).text_blocks != []
    parser_output_no_data = parser_output_json.copy()
    parser_output_no_data["pdf_data"] = None
    parser_output_no_data["document_content_type"] = None
    assert ParserOutput.parse_obj(parser_output_no_data).text_blocks == []

    # Test the to string method: non-empty with data, empty string without.
    assert ParserOutput.parse_obj(parser_output_json).to_string() != ""
    assert ParserOutput.parse_obj(parser_output_no_data).to_string() == ""

0 comments on commit c3dfce4

Please sign in to comment.