Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
Merge pull request #47 from climatepolicyradar/feature/pdct-471-bug-f…
Browse files Browse the repository at this point in the history
…ix-parser-output-method-that-returns-bad-text

Add a VerticalFlipError exception that is raised when vertically_flip_text_block_coords fails.
  • Loading branch information
THOR300 authored Oct 5, 2023
2 parents 56217c3 + bb01714 commit 8964970
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 19 deletions.
15 changes: 14 additions & 1 deletion src/cpr_data_access/parser_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
logger = logging.getLogger(__name__)


class VerticalFlipError(Exception):
    """
    Raised when vertically flipping a document's text block coordinates fails.

    `vertically_flip_text_block_coords` raises this error, chained to the
    underlying exception, so callers can catch one specific failure type.
    The most common cause is a text block that references a page number
    missing from the page_metadata mapping.
    """


class BlockType(str, Enum):
"""
List of possible block types from the PubLayNet model.
Expand Down Expand Up @@ -319,6 +325,10 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO:
Flips the coordinates of all PDF text blocks vertically.
Acts in-place on the coordinates in the ParserOutput object.
Should the document fail to flip, a VerticalFlipError is raised. This is most
commonly due to a page number being referenced in a text block that doesn't
exist in the page_metadata mapping.
"""

if self.pdf_data is None:
Expand All @@ -344,11 +354,14 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO:
text_block.coords[1],
text_block.coords[0],
]
except Exception:
except Exception as e:
logger.exception(
"Error flipping text block coordinates.",
extra={"props": {"document_id": self.document_id}},
)
raise VerticalFlipError(
f"Failed to flip text blocks for {self.document_id}"
) from e

return self

Expand Down
40 changes: 22 additions & 18 deletions tests/test_parser_models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import unittest

import pydantic
import pytest

from cpr_data_access.parser_models import (
ParserInput,
ParserOutput,
VerticalFlipError,
PDFTextBlock,
)
from cpr_data_access.pipeline_general_models import (
CONTENT_TYPE_PDF,
Expand Down Expand Up @@ -47,45 +48,37 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
parser_output_no_pdf_data["pdf_data"] = None
parser_output_no_pdf_data["document_content_type"] = CONTENT_TYPE_PDF

with unittest.TestCase().assertRaises(
pydantic.error_wrappers.ValidationError
) as context:
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
ParserOutput.parse_obj(parser_output_no_pdf_data)
assert "pdf_data must be set for PDF documents" in str(context.exception)
assert "pdf_data must be set for PDF documents" in str(context.value)

parser_output_no_html_data = parser_output_json_pdf.copy()
parser_output_no_html_data["html_data"] = None
parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML

with unittest.TestCase().assertRaises(
pydantic.error_wrappers.ValidationError
) as context:
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
ParserOutput.parse_obj(parser_output_no_html_data)
assert "html_data must be set for HTML documents" in str(context.exception)
assert "html_data must be set for HTML documents" in str(context.value)

parser_output_no_content_type = parser_output_json_pdf.copy()
# PDF data is set as the default
parser_output_no_content_type["document_content_type"] = None

with unittest.TestCase().assertRaises(
pydantic.error_wrappers.ValidationError
) as context:
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
ParserOutput.parse_obj(parser_output_no_content_type)
assert (
"html_data and pdf_data must be null for documents with no content type."
) in str(context.exception)
) in str(context.value)

parser_output_not_known_content_type = parser_output_json_pdf.copy()
# PDF data is set as the default
parser_output_not_known_content_type["document_content_type"] = "not_known"

with unittest.TestCase().assertRaises(
pydantic.error_wrappers.ValidationError
) as context:
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
ParserOutput.parse_obj(parser_output_not_known_content_type)
assert (
"html_data and pdf_data must be null for documents with no content type."
) in str(context.exception)
) in str(context.value)

# Test the text blocks property
assert ParserOutput.parse_obj(parser_output_json_pdf).text_blocks != []
Expand All @@ -103,6 +96,17 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
original_text_blocks = parser_output.text_blocks
assert parser_output.vertically_flip_text_block_coords() != original_text_blocks

parser_output = ParserOutput.parse_obj(parser_output_json_pdf)
# Set a page number that doesn't exist in the page_metadata field to trigger the exception
assert isinstance(parser_output.text_blocks[0], PDFTextBlock)
parser_output.text_blocks[0].page_number = 123456 # type: ignore

with pytest.raises(VerticalFlipError) as context:
parser_output.vertically_flip_text_block_coords()
assert str(context.value) == (
f"Failed to flip text blocks for {parser_output.document_id}"
)

# Test the get_text_blocks method
# The test html document has invalid html data so the text blocks should be empty
parser_output = ParserOutput.parse_obj(parser_output_json_html)
Expand Down

0 comments on commit 8964970

Please sign in to comment.