From 4a45756e90ff07269e5688cda5bc56dc4dcd53ad Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 5 Oct 2023 15:06:14 +0100 Subject: [PATCH 1/6] Adding exception to the method. --- src/cpr_data_access/parser_models.py | 13 ++++++++++++- tests/test_parser_models.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py index 22c4607..8cbf8f8 100644 --- a/src/cpr_data_access/parser_models.py +++ b/src/cpr_data_access/parser_models.py @@ -19,6 +19,14 @@ logger = logging.getLogger(__name__) +class VerticalFlipError(Exception): + """Exception for when a vertical flip fails.""" + + def __init__(self, message: str): + super().__init__(message) + self.message = message + + class BlockType(str, Enum): """ List of possible block types from the PubLayNet model. @@ -344,11 +352,14 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO: text_block.coords[1], text_block.coords[0], ] - except Exception: + except Exception as e: logger.exception( "Error flipping text block coordinates.", extra={"props": {"document_id": self.document_id}}, ) + raise VerticalFlipError( + f"Failed to flip text blocks for {self.document_id}" + ) from e return self diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 3bc0dfc..6f7a80f 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -4,7 +4,7 @@ from cpr_data_access.parser_models import ( ParserInput, - ParserOutput, + ParserOutput, VerticalFlipError, ) from cpr_data_access.pipeline_general_models import ( CONTENT_TYPE_PDF, @@ -103,6 +103,16 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - original_text_blocks = parser_output.text_blocks assert parser_output.vertically_flip_text_block_coords() != original_text_blocks + parser_output = ParserOutput.parse_obj(parser_output_json_pdf) + # Set as page number that doesn't exist in the page_metadata field to throw exception + parser_output.text_blocks[0].page_number = 123456 + + with unittest.TestCase().assertRaises(VerticalFlipError) as context: + parser_output.vertically_flip_text_block_coords() + assert str(context.exception) == ( + f"Failed to flip text blocks for {parser_output.document_id}" + ) + # Test the get_text_blocks method # The test html document has invalid html data so the text blocks should be empty parser_output = ParserOutput.parse_obj(parser_output_json_html) From aa903ce101f4c5dfc717c052a7130e5384d3304d Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 5 Oct 2023 15:06:46 +0100 Subject: [PATCH 2/6] Refactoring. --- tests/test_parser_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 6f7a80f..d55df78 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -4,7 +4,8 @@ from cpr_data_access.parser_models import ( ParserInput, - ParserOutput, VerticalFlipError, + ParserOutput, + VerticalFlipError, ) from cpr_data_access.pipeline_general_models import ( CONTENT_TYPE_PDF, From a38fbaec0da6b06f3084c73afc36787f5de85fe8 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 5 Oct 2023 15:09:57 +0100 Subject: [PATCH 3/6] Refactoring. --- src/cpr_data_access/parser_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py index 8cbf8f8..5a55a4f 100644 --- a/src/cpr_data_access/parser_models.py +++ b/src/cpr_data_access/parser_models.py @@ -358,7 +358,7 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO: extra={"props": {"document_id": self.document_id}}, ) raise VerticalFlipError( - f"Failed to flip text blocks for {self.document_id}" + f"Failed to flip text blocks for {self.document_id}" ) from e return self From 75a2079f10e9949e5b08c3db104fd509c8f64ec5 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 5 Oct 2023 15:21:40 +0100 Subject: [PATCH 4/6] Resolving PR comments. --- src/cpr_data_access/parser_models.py | 9 +++++---- tests/test_parser_models.py | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py index 5a55a4f..b08254c 100644 --- a/src/cpr_data_access/parser_models.py +++ b/src/cpr_data_access/parser_models.py @@ -21,10 +21,7 @@ class VerticalFlipError(Exception): """Exception for when a vertical flip fails.""" - - def __init__(self, message: str): - super().__init__(message) - self.message = message + pass class BlockType(str, Enum): @@ -327,6 +324,10 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO: Flips the coordinates of all PDF text blocks vertically. Acts in-place on the coordinates in the ParserOutput object. + + Should the document fail to flip, a VerticalFlipError is raised. This is most + commonly due to a page number being referenced in a text block that doesn't + exist in the page_metadata mapping. """ if self.pdf_data is None: diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index d55df78..0fa1f6b 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -1,6 +1,7 @@ import unittest import pydantic +import pytest from cpr_data_access.parser_models import ( ParserInput, @@ -58,11 +59,11 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - parser_output_no_html_data["html_data"] = None parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML - with unittest.TestCase().assertRaises( + with pytest.raises( pydantic.error_wrappers.ValidationError ) as context: ParserOutput.parse_obj(parser_output_no_html_data) - assert "html_data must be set for HTML documents" in str(context.exception) + assert "html_data must be set for HTML documents" in str(context.value) parser_output_no_content_type = parser_output_json_pdf.copy() # PDF data is set as the default From ca7cd44ec90e4d85108ee185616bd6f18717905f Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 5 Oct 2023 15:22:31 +0100 Subject: [PATCH 5/6] Refactoring. --- src/cpr_data_access/parser_models.py | 1 + tests/test_parser_models.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py index b08254c..37a7b41 100644 --- a/src/cpr_data_access/parser_models.py +++ b/src/cpr_data_access/parser_models.py @@ -21,6 +21,7 @@ class VerticalFlipError(Exception): """Exception for when a vertical flip fails.""" + pass diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 0fa1f6b..e71e74f 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -59,9 +59,7 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - parser_output_no_html_data["html_data"] = None parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML - with pytest.raises( - pydantic.error_wrappers.ValidationError - ) as context: + with pytest.raises(pydantic.error_wrappers.ValidationError) as context: ParserOutput.parse_obj(parser_output_no_html_data) assert "html_data must be set for HTML documents" in str(context.value) From bb0171468a4c227ef213b0625ed3cca7911813ff Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 5 Oct 2023 15:34:20 +0100 Subject: [PATCH 6/6] Refactoring. --- tests/test_parser_models.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index e71e74f..31de890 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -1,5 +1,3 @@ -import unittest - import pydantic import pytest @@ -7,6 +5,7 @@ ParserInput, ParserOutput, VerticalFlipError, + PDFTextBlock, ) from cpr_data_access.pipeline_general_models import ( CONTENT_TYPE_PDF, @@ -49,11 +48,9 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - parser_output_no_pdf_data["pdf_data"] = None parser_output_no_pdf_data["document_content_type"] = CONTENT_TYPE_PDF - with unittest.TestCase().assertRaises( - pydantic.error_wrappers.ValidationError - ) as context: + with pytest.raises(pydantic.error_wrappers.ValidationError) as context: ParserOutput.parse_obj(parser_output_no_pdf_data) - assert "pdf_data must be set for PDF documents" in str(context.exception) + assert "pdf_data must be set for PDF documents" in str(context.value) parser_output_no_html_data = parser_output_json_pdf.copy() parser_output_no_html_data["html_data"] = None @@ -67,25 +64,21 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - # PDF data is set as the default parser_output_no_content_type["document_content_type"] = None - with unittest.TestCase().assertRaises( - pydantic.error_wrappers.ValidationError - ) as context: + with pytest.raises(pydantic.error_wrappers.ValidationError) as context: ParserOutput.parse_obj(parser_output_no_content_type) assert ( "html_data and pdf_data must be null for documents with no content type." - ) in str(context.exception) + ) in str(context.value) parser_output_not_known_content_type = parser_output_json_pdf.copy() # PDF data is set as the default parser_output_not_known_content_type["document_content_type"] = "not_known" - with unittest.TestCase().assertRaises( - pydantic.error_wrappers.ValidationError - ) as context: + with pytest.raises(pydantic.error_wrappers.ValidationError) as context: ParserOutput.parse_obj(parser_output_not_known_content_type) assert ( "html_data and pdf_data must be null for documents with no content type." - ) in str(context.exception) + ) in str(context.value) # Test the text blocks property assert ParserOutput.parse_obj(parser_output_json_pdf).text_blocks != [] @@ -105,11 +98,12 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) - parser_output = ParserOutput.parse_obj(parser_output_json_pdf) # Set as page number that doesn't exist in the page_metadata field to throw exception - parser_output.text_blocks[0].page_number = 123456 + assert isinstance(parser_output.text_blocks[0], PDFTextBlock) + parser_output.text_blocks[0].page_number = 123456 # type: ignore - with unittest.TestCase().assertRaises(VerticalFlipError) as context: + with pytest.raises(VerticalFlipError) as context: parser_output.vertically_flip_text_block_coords() - assert str(context.exception) == ( + assert str(context.value) == ( f"Failed to flip text blocks for {parser_output.document_id}" )