Skip to content

Commit

Permalink
Fix bug with table outputs
Browse files: browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 21, 2025
1 parent 355d11b commit a5df6a2
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 4 deletions.
5 changes: 3 additions & 2 deletions marker/schema/blocks/basetable.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class BaseTable(Block):
html: str | None = None

def format_cells(self, document, child_blocks):
- child_cells: List[TableCell] = [document.get_block(c.id) for c in child_blocks]
+ child_cells: List[TableCell] = [document.get_block(c.id) for c in child_blocks if c.id.block_type == BlockTypes.TableCell]
unique_rows = sorted(list(set([c.row_id for c in child_cells])))
html_repr = "<table><tbody>"
for row_id in unique_rows:
Expand All @@ -28,10 +28,11 @@ def assemble_html(self, document, child_blocks: List[BlockOutput], parent_struct
child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
template = super().assemble_html(document, child_ref_blocks, parent_structure)

+ child_block_types = set([c.id.block_type for c in child_blocks])
if self.html:
# LLM processor
return template + self.html
- elif len(child_blocks) > 0 and child_blocks[0].id.block_type == BlockTypes.TableCell:
+ elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
# Table processor
return template + self.format_cells(document, child_blocks)
else:
Expand Down
3 changes: 2 additions & 1 deletion marker/scripts/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ async def _convert_pdf(params: CommonParams):
config_parser = ConfigParser(options)
config_dict = config_parser.generate_config_dict()
config_dict["pdftext_workers"] = 1
- converter = PdfConverter(
+ converter_cls = PdfConverter
+ converter = converter_cls(
config=config_dict,
artifact_dict=app_data["models"],
processor_list=config_parser.get_processors(),
Expand Down
3 changes: 2 additions & 1 deletion marker/scripts/streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def load_models():
def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
config_dict = config_parser.generate_config_dict()
config_dict["pdftext_workers"] = 1
- converter = PdfConverter(
+ converter_cls = PdfConverter
+ converter = converter_cls(
config=config_dict,
artifact_dict=model_dict,
processor_list=config_parser.get_processors(),
Expand Down
3 changes: 3 additions & 0 deletions tests/processors/test_table_processor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
+ from marker.renderers.json import JSONRenderer

from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
Expand All @@ -18,6 +19,8 @@ def test_table_processor(pdf_document, detection_model, recognition_model, table
assert len(children) > 0
assert isinstance(children[0], TableCell)

+ assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2

renderer = MarkdownRenderer()
table_output = renderer(pdf_document)
assert "Schedule" in table_output.markdown

0 comments on commit a5df6a2

Please sign in to comment.