Skip to content

Commit

Permalink
new fix_overlaps param
Browse files Browse the repository at this point in the history
  • Loading branch information
geli-gel committed Nov 8, 2023
1 parent 8336d06 commit 7af1e93
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 13 deletions.
10 changes: 6 additions & 4 deletions src/mmda/parsers/grobid_augment_existing_document_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,25 +117,27 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document:
for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs:
section_spans = []
if heading_box_group:
heading_span_group_in_list, unallocated_section_tokens_dict = (
heading_span_group_in_list = (
box_groups_to_span_groups(
[heading_box_group],
doc,
center=True,
unallocated_tokens_dict=unallocated_section_tokens_dict
unallocated_tokens_dict=unallocated_section_tokens_dict,
fix_overlaps=True,
)
)
heading_span_group = heading_span_group_in_list[0]
heading_span_groups.append(heading_span_group)
section_spans.extend(heading_span_group.spans)
this_section_paragraph_span_groups = []
for sentence_box_groups in paragraphs:
this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups(
this_paragraph_sentence_span_groups = box_groups_to_span_groups(
sentence_box_groups,
doc,
center=True,
pad_x=True,
unallocated_tokens_dict=unallocated_section_tokens_dict
unallocated_tokens_dict=unallocated_section_tokens_dict,
fix_overlaps=True,
)
if all([sg.spans for sg in this_paragraph_sentence_span_groups]):
sentence_span_groups.extend(this_paragraph_sentence_span_groups)
Expand Down
27 changes: 18 additions & 9 deletions src/mmda/utils/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,23 @@ def box_groups_to_span_groups(
doc,
pad_x: bool = False,
center: bool = False,
unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None
unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None,
fix_overlaps: bool = False,
) -> List[SpanGroup]:
"""Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated
Args
`box_groups` (List[BoxGroup])
`doc` (Document) base document annotated with pages, tokens, rows to
`center` (bool) if True, considers tokens to be overlapping with boxes only if their centers overlap
`center` (bool) if True, considers tokens to be overlapping with boxes only if their centers overlap
`unallocated_tokens_dict` (Optional[Dict]) of token spangroups keyed by page. If provided, will use as starting
point for determining if token is already allocated. Assumes the tokens within are of the same type as the `doc`
(i.e., tokens from both doc and the dict both have their box data in either Span.box or SpanGroup.boxgroup)
point for determining if token is already allocated. Assumes the tokens within are of the same type as the
`doc` (i.e., tokens from both doc and the dict both have their box data in either Span.box or
SpanGroup.boxgroup)
`fix_overlaps` (bool) if True, will attempt to fix overlapping spans within a SpanGroup by omitting
spans from already allocated tokens that end up contained in the derived_spans that come from MergeSpans.
This allows for the possibility of a BoxGroup that covers text to end up with a SpanGroup that is missing
spans or even has no spans since a previous BoxGroup already allocated all the underlying tokens. This
reduces the possibility of SpanGroup overlap errors, but may not return the desired SpanGroups.
Returns
Union (either) of:
-List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group,
Expand Down Expand Up @@ -162,17 +169,19 @@ def omit_span_from_derived_spans(t_span):
if token_box_in_box_group:
if sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]:
unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token)
# otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, the assumption
# is that the token has already been allocated by a different box_group, so, we need to remove it from our
# derived spans to avoid 'SpanGroup overlap' error.
# otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens,
# the assumption is that the token has already been allocated by a different box_group, so, we need
# to remove it from our derived spans to avoid 'SpanGroup overlap' error.
else:
omit_span_from_derived_spans(sg_token.spans[0])
if fix_overlaps:
omit_span_from_derived_spans(sg_token.spans[0])
else:
if sg_token in unallocated_tokens[sg_token.spans[0].box.page]:
unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token)
# same scenario as above.
else:
omit_span_from_derived_spans(sg_token.spans[0])
if fix_overlaps:
omit_span_from_derived_spans(sg_token.spans[0])

derived_span_groups.append(
SpanGroup(
Expand Down

0 comments on commit 7af1e93

Please sign in to comment.