Merge pull request #87 from climatepolicyradar/bugfix/raise-correct-error

Bug Fix - Raise Correct Exception
climatepolicyradar · Nov 8, 2023 · 0a2bc68 · 0a2bc68
2 parents c841aab + 6257921
commit 0a2bc68
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 4 deletions.
diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py
@@ -6,7 +6,7 @@
 from collections import Counter
 
 from deprecation import deprecated
-from pydantic import BaseModel, AnyHttpUrl, Field, root_validator
+from pydantic import BaseModel, AnyHttpUrl, Field, validator
 from langdetect import DetectorFactory, LangDetectException, detect
 
 from cpr_data_access.pipeline_general_models import (
@@ -193,8 +193,8 @@ class BaseParserOutput(BaseModel):
     pdf_data: Optional[PDFData] = None
     pipeline_metadata: Json = {}  # note: defaulting to {} here is safe (pydantic)
 
-    @root_validator
-    def check_html_pdf_metadata(cls, values):
+    @validator("pdf_data")  # Validate the pdf_data field as it is ordered last
+    def check_html_pdf_metadata(cls, value, values):
         """
         Validate the relationship between content-type and the data that is set.
 
@@ -204,6 +204,7 @@ def check_html_pdf_metadata(cls, values):
         Check that if the content-type is not HTML or PDF, then html_data and pdf_data
         are both null.
         """
+        values["pdf_data"] = value
         if (
             values["document_content_type"] == CONTENT_TYPE_HTML
             and values["html_data"] is None
@@ -224,7 +225,7 @@ def check_html_pdf_metadata(cls, values):
                 "html_data and pdf_data must be null for documents with no content type."
             )
 
-        return values
+        return values["pdf_data"]
 
     def get_text_blocks(self, including_invalid_html=False) -> Sequence[TextBlock]:
         """A method for getting text blocks with the option to include invalid html."""

diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py
@@ -134,3 +134,12 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
         == text_blocks_include_invalid
         == text_blocks_not_include_invalid
     )
+
+    # Test that the correct validation error is thrown during instantiation
+    parser_output_json_bad_text_block = parser_output_json_pdf.copy()
+    parser_output_json_bad_text_block["pdf_data"]["text_blocks"][0][
+        "type"
+    ] = "ThisBlockTypeDoesNotExist"
+    with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
+        ParserOutput.parse_obj(parser_output_json_bad_text_block)
+    assert "value is not a valid enumeration member" in str(context.value)