From 00aded4a6838d966cd8cfd45eab397a38d951fa8 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 7 Nov 2023 10:39:32 +0000 Subject: [PATCH 1/3] Adding a failing test. --- tests/test_parser_models.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 31de890..17f346e 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -5,7 +5,7 @@ ParserInput, ParserOutput, VerticalFlipError, - PDFTextBlock, + PDFTextBlock ) from cpr_data_access.pipeline_general_models import ( CONTENT_TYPE_PDF, @@ -134,3 +134,14 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - == text_blocks_include_invalid == text_blocks_not_include_invalid ) + + # Test that the correct validation error is thrown during instantiation + parser_output_json_bad_text_block = parser_output_json_pdf.copy() + parser_output_json_bad_text_block["pdf_data"]["text_blocks"][0]["type"] = ( + "ThisBlockTypeDoesNotExist" + ) + with pytest.raises(pydantic.error_wrappers.ValidationError) as context: + ParserOutput.parse_obj(parser_output_json_bad_text_block) + assert str(context.value) == ( + f"Random" + ) From 1dd0454fc49dba976a12e7ce5b6424eeac76b2f2 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 7 Nov 2023 11:44:10 +0000 Subject: [PATCH 2/3] Updating the ordering of validation and adding a test. --- dumy.py | 0 src/cpr_data_access/parser_models.py | 9 +++++---- tests/test_parser_models.py | 4 +--- 3 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 dumy.py diff --git a/dumy.py b/dumy.py new file mode 100644 index 0000000..e69de29 diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py index 6720a76..44058ef 100644 --- a/src/cpr_data_access/parser_models.py +++ b/src/cpr_data_access/parser_models.py @@ -6,7 +6,7 @@ from collections import Counter from deprecation import deprecated -from pydantic import BaseModel, AnyHttpUrl, Field, root_validator +from pydantic import BaseModel, AnyHttpUrl, Field, root_validator, validator from langdetect import DetectorFactory, LangDetectException, detect from cpr_data_access.pipeline_general_models import ( @@ -193,8 +193,8 @@ class BaseParserOutput(BaseModel): pdf_data: Optional[PDFData] = None pipeline_metadata: Json = {} # note: defaulting to {} here is safe (pydantic) - @root_validator - def check_html_pdf_metadata(cls, values): + @validator("pdf_data") + def check_html_pdf_metadata(cls, value, values, config, field): """ Validate the relationship between content-type and the data that is set. @@ -204,6 +204,7 @@ def check_html_pdf_metadata(cls, values): Check that if the content-type is not HTML or PDF, then html_data and pdf_data are both null. """ + values["pdf_data"] = value if ( values["document_content_type"] == CONTENT_TYPE_HTML and values["html_data"] is None @@ -224,7 +225,7 @@ def check_html_pdf_metadata(cls, values): "html_data and pdf_data must be null for documents with no content type." ) - return values + return values["pdf_data"] def get_text_blocks(self, including_invalid_html=False) -> Sequence[TextBlock]: """A method for getting text blocks with the option to include invalid html.""" diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 17f346e..c6e9c8a 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -142,6 +142,4 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - ) with pytest.raises(pydantic.error_wrappers.ValidationError) as context: ParserOutput.parse_obj(parser_output_json_bad_text_block) - assert str(context.value) == ( - f"Random" - ) + assert "value is not a valid enumeration member" in str(context.value) From 6257921e38b23294d5e42906b9d847f075d6013f Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 7 Nov 2023 11:49:59 +0000 Subject: [PATCH 3/3] Refactoring --- dumy.py | 0 src/cpr_data_access/parser_models.py | 6 +++--- tests/test_parser_models.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) delete mode 100644 dumy.py diff --git a/dumy.py b/dumy.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py index 44058ef..78d805c 100644 --- a/src/cpr_data_access/parser_models.py +++ b/src/cpr_data_access/parser_models.py @@ -6,7 +6,7 @@ from collections import Counter from deprecation import deprecated -from pydantic import BaseModel, AnyHttpUrl, Field, root_validator, validator +from pydantic import BaseModel, AnyHttpUrl, Field, validator from langdetect import DetectorFactory, LangDetectException, detect from cpr_data_access.pipeline_general_models import ( @@ -193,8 +193,8 @@ class BaseParserOutput(BaseModel): pdf_data: Optional[PDFData] = None pipeline_metadata: Json = {} # note: defaulting to {} here is safe (pydantic) - @validator("pdf_data") - def check_html_pdf_metadata(cls, value, values, config, field): + @validator("pdf_data") # Validate the pdf_data field as it is ordered last + def check_html_pdf_metadata(cls, value, values): """ Validate the relationship between content-type and the data that is set. diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index c6e9c8a..6cdbf46 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -5,7 +5,7 @@ ParserInput, ParserOutput, VerticalFlipError, - PDFTextBlock + PDFTextBlock, ) from cpr_data_access.pipeline_general_models import ( CONTENT_TYPE_PDF, @@ -137,9 +137,9 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - # Test that the correct validation error is thrown during instantiation parser_output_json_bad_text_block = parser_output_json_pdf.copy() - parser_output_json_bad_text_block["pdf_data"]["text_blocks"][0]["type"] = ( - "ThisBlockTypeDoesNotExist" - ) + parser_output_json_bad_text_block["pdf_data"]["text_blocks"][0][ + "type" + ] = "ThisBlockTypeDoesNotExist" with pytest.raises(pydantic.error_wrappers.ValidationError) as context: ParserOutput.parse_obj(parser_output_json_bad_text_block) assert "value is not a valid enumeration member" in str(context.value)