From 0a3f525f0a2c8efbdfe55c5a27c3e8ac526662f9 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 18:19:28 +0200 Subject: [PATCH 01/22] port processor to core v3 --- qurator/eynollah/processor.py | 89 +++++++++++------------------------ requirements.txt | 2 +- 2 files changed, 29 insertions(+), 62 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 1bd190e..c8748af 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -1,68 +1,35 @@ -from json import loads -from pkg_resources import resource_string -from tempfile import NamedTemporaryFile -from pathlib import Path -from os.path import join - -from PIL import Image - +from typing import Optional +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models import OcrdPage from ocrd import Processor -from ocrd_modelfactory import page_from_file, exif_from_filename -from ocrd_models import OcrdFile, OcrdExif -from ocrd_models.ocrd_page import to_xml -from ocrd_utils import ( - getLogger, - MIMETYPE_PAGE, - assert_file_grp_cardinality, - make_file_id -) from .eynollah import Eynollah -from .utils.pil_cv2 import pil2cv - -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) class EynollahProcessor(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) + @property + def metadata_location(self) -> str: + return 'eynollah/ocrd-tool.json' - def process(self): - LOG = getLogger('eynollah') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) - self.add_metadata(pcgts) - page = pcgts.get_Page() - # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename - eynollah_kwargs = { - 'dir_models': self.resolve_resource(self.parameter['models']), - 'allow_enhancement': False, - 'curved_line': self.parameter['curved_line'], - 'full_layout': self.parameter['full_layout'], - 'allow_scaling': self.parameter['allow_scaling'], - 'headers_off': self.parameter['headers_off'], - 'tables': self.parameter['tables'], - 'override_dpi': self.parameter['dpi'], - 'logger': LOG, - 'pcgts': pcgts, - 'image_filename': image_filename - } - Eynollah(**eynollah_kwargs).run() - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id) + '.xml', - content=to_xml(pcgts)) + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts + assert input_pcgts[0] + pcgts = input_pcgts[0] + page = pcgts.get_Page() + # XXX loses DPI information + # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename + Eynollah( + dir_models=self.resolve_resource(self.parameter['models']), + allow_enhancement=False, + curved_line=self.parameter['curved_line'], + full_layout=self.parameter['full_layout'], + allow_scaling=self.parameter['allow_scaling'], + headers_off=self.parameter['headers_off'], + tables=self.parameter['tables'], + override_dpi=self.parameter['dpi'], + logger=self.logger, + pcgts=pcgts, + image_filename=image_filename + ).run() + return OcrdPageResult(pcgts) diff --git a/requirements.txt b/requirements.txt index f01d319..feeea99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # ocrd includes opencv, numpy, shapely, click -ocrd >= 2.23.3 +ocrd >= 3.0.0a2 numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow == 2.12.1 From 4a13781ef49cd964accabb41b583cd4083ce0293 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 18:32:29 +0200 Subject: [PATCH 02/22] class Eynollah: add typing, consistent interface in CLI and OCR-D CLI --- qurator/eynollah/cli.py | 5 ++-- qurator/eynollah/eynollah.py | 58 +++++++++++++++++++----------------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py index 822db18..99bf5ac 100644 --- a/qurator/eynollah/cli.py +++ b/qurator/eynollah/cli.py @@ -1,6 +1,6 @@ import sys import click -from ocrd_utils import initLogging, setOverrideLogLevel +from ocrd_utils import getLogger, initLogging, setOverrideLogLevel from qurator.eynollah.eynollah import Eynollah @@ -176,10 +176,11 @@ def main( print('Error: You used -tll to enable light textline detection but -light is not enabled') sys.exit(1) eynollah = Eynollah( + model, + getLogger('Eynollah'), image_filename=image, dir_out=out, dir_in=dir_in, - dir_models=model, dir_of_cropped_images=save_images, dir_of_layout=save_layout, dir_of_deskewed=save_deskewed, diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 7f5561c..f80798b 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -6,14 +6,18 @@ document layout analysis (segmentation) with output in PAGE-XML """ +from logging import Logger import math import os import sys import time +from typing import Optional import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count import gc +from PIL.Image import Image +from ocrd import OcrdPage from ocrd_utils import getLogger import cv2 import numpy as np @@ -142,32 +146,32 @@ def get_config(self): class Eynollah: def __init__( self, - dir_models, - image_filename=None, - image_pil=None, - image_filename_stem=None, - dir_out=None, - dir_in=None, - dir_of_cropped_images=None, - dir_of_layout=None, - dir_of_deskewed=None, - dir_of_all=None, - dir_save_page=None, - enable_plotting=False, - allow_enhancement=False, - curved_line=False, - textline_light=False, - full_layout=False, - tables=False, - right2left=False, - input_binary=False, - allow_scaling=False, - headers_off=False, - light_version=False, - ignore_page_extraction=False, - override_dpi=None, - logger=None, - pcgts=None, + dir_models : str, + logger : Logger, + image_filename : Optional[str] = None, + image_pil : Optional[Image] = None, + image_filename_stem : Optional[str] = None, + dir_out : Optional[str] = None, + dir_in : Optional[str] = None, + dir_of_cropped_images : Optional[str] = None, + dir_of_layout : Optional[str] = None, + dir_of_deskewed : Optional[str] = None, + dir_of_all : Optional[str] = None, + dir_save_page : Optional[str] = None, + enable_plotting : bool = False, + allow_enhancement : bool = False, + curved_line : bool = False, + textline_light : bool = False, + full_layout : bool = False, + tables : bool = False, + right2left : bool = False, + input_binary : bool = False, + allow_scaling : bool = False, + headers_off : bool = False, + light_version : bool = False, + ignore_page_extraction : bool = False, + override_dpi : Optional[int] = None, + pcgts : Optional[OcrdPage] = None, ): if not dir_in: if image_pil: @@ -213,7 +217,7 @@ def __init__( curved_line=self.curved_line, textline_light = self.textline_light, pcgts=pcgts) - self.logger = logger if logger else getLogger('eynollah') + self.logger = logger self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" From 9ce02a569e49fe21ddff01dc14261b3f0583789f Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 18:32:59 +0200 Subject: [PATCH 03/22] ocrd-tool: add "allow_enhancement" parameter --- qurator/eynollah/ocrd-tool.json | 31 ++++++++++++++++++------------- qurator/eynollah/processor.py | 6 +++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 8a2cb95..311ac21 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -29,11 +29,11 @@ "default": true, "description": "Try to detect all element subtypes, including drop-caps and headings" }, - "tables": { - "type": "boolean", - "default": false, - "description": "Try to detect table regions" - }, + "tables": { + "type": "boolean", + "default": false, + "description": "Try to detect table regions" + }, "curved_line": { "type": "boolean", "default": false, @@ -44,6 +44,11 @@ "default": false, "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)" }, + "allow_enhancement": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not." + }, "headers_off": { "type": "boolean", "default": false, @@ -51,14 +56,14 @@ } }, "resources": [ - { - "description": "models for eynollah (TensorFlow format)", - "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz", - "name": "default", - "size": 1761991295, - "type": "archive", - "path_in_archive": "models_eynollah" - } + { + "description": "models for eynollah (TensorFlow format)", + "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz", + "name": "default", + "size": 1761991295, + "type": "archive", + "path_in_archive": "models_eynollah" + } ] } } diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index c8748af..304524a 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -20,15 +20,15 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename Eynollah( - dir_models=self.resolve_resource(self.parameter['models']), - allow_enhancement=False, + self.resolve_resource(self.parameter['models']), + self.logger, + allow_enhancement=self.parameter['allow_enhancement'], curved_line=self.parameter['curved_line'], full_layout=self.parameter['full_layout'], allow_scaling=self.parameter['allow_scaling'], headers_off=self.parameter['headers_off'], tables=self.parameter['tables'], override_dpi=self.parameter['dpi'], - logger=self.logger, pcgts=pcgts, image_filename=image_filename ).run() From 0d83db7bc4b18b459b1ae58899bcb25d8d10ada0 Mon Sep 17 00:00:00 2001 From: kba Date: Sat, 24 Aug 2024 16:46:25 +0200 Subject: [PATCH 04/22] update processor to the latest change in bertsky/core#14 --- qurator/eynollah/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 304524a..83fed0e 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -8,7 +8,7 @@ class EynollahProcessor(Processor): @property - def metadata_location(self) -> str: + def metadata_filename(self) -> str: return 'eynollah/ocrd-tool.json' def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: From 87adc4b0c69233f15a637be5841477ad6f905ece Mon Sep 17 00:00:00 2001 From: kba Date: Sat, 24 Aug 2024 16:51:52 +0200 Subject: [PATCH 05/22] ocrd interface: add light_mode parameter --- qurator/eynollah/ocrd-tool.json | 5 +++++ qurator/eynollah/processor.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 311ac21..28dd772 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -49,6 +49,11 @@ "default": false, "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not." }, + "light mode": { + "type": "boolean", + "default": false, + "description": "lighter and faster but simpler method for main region detection and deskewing" + }, "headers_off": { "type": "boolean", "default": false, diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 83fed0e..65122dd 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -14,6 +14,7 @@ def metadata_filename(self) -> str: def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: assert input_pcgts assert input_pcgts[0] + assert self.parameter pcgts = input_pcgts[0] page = pcgts.get_Page() # XXX loses DPI information @@ -24,6 +25,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional self.logger, allow_enhancement=self.parameter['allow_enhancement'], curved_line=self.parameter['curved_line'], + light_version=self.parameter['light_mode'], full_layout=self.parameter['full_layout'], allow_scaling=self.parameter['allow_scaling'], headers_off=self.parameter['headers_off'], From 39b16e59781d683e0d15ec750b7055d0d5969460 Mon Sep 17 00:00:00 2001 From: kba Date: Sat, 24 Aug 2024 18:00:45 +0200 Subject: [PATCH 06/22] ocrd interface: add textline_light --- qurator/eynollah/ocrd-tool.json | 7 ++++++- qurator/eynollah/processor.py | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 28dd772..ef6230c 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -49,11 +49,16 @@ "default": false, "description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not." }, - "light mode": { + "light_mode": { "type": "boolean", "default": false, "description": "lighter and faster but simpler method for main region detection and deskewing" }, + "textline_light": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method." + }, "headers_off": { "type": "boolean", "default": false, diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 65122dd..c4d3cb2 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -11,6 +11,10 @@ class EynollahProcessor(Processor): def metadata_filename(self) -> str: return 'eynollah/ocrd-tool.json' + def setup(self) -> None: + if self.parameter['textline_light'] and not self.parameter['light_mode']: + raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled") + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: assert input_pcgts assert input_pcgts[0] @@ -26,6 +30,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional allow_enhancement=self.parameter['allow_enhancement'], curved_line=self.parameter['curved_line'], light_version=self.parameter['light_mode'], + textline_light=self.parameter['textline_light'], full_layout=self.parameter['full_layout'], allow_scaling=self.parameter['allow_scaling'], headers_off=self.parameter['headers_off'], From ddcc0198bdf0e16f649cc671ef0d25f38614a784 Mon Sep 17 00:00:00 2001 From: kba Date: Sat, 24 Aug 2024 18:05:21 +0200 Subject: [PATCH 07/22] ocrd interface: add right_to_left --- qurator/eynollah/ocrd-tool.json | 5 +++++ qurator/eynollah/processor.py | 1 + 2 files changed, 6 insertions(+) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index ef6230c..02a2a23 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -59,6 +59,11 @@ "default": false, "description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method." }, + "right_to_left": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool will extract right-to-left reading order." + }, "headers_off": { "type": "boolean", "default": false, diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index c4d3cb2..d1bc44a 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -30,6 +30,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional allow_enhancement=self.parameter['allow_enhancement'], curved_line=self.parameter['curved_line'], light_version=self.parameter['light_mode'], + right2left=self.parameter['right_to_left'], textline_light=self.parameter['textline_light'], full_layout=self.parameter['full_layout'], allow_scaling=self.parameter['allow_scaling'], From d7caeb2b05a65b9747343d31b42f723f8f11db6e Mon Sep 17 00:00:00 2001 From: kba Date: Sat, 24 Aug 2024 18:11:15 +0200 Subject: [PATCH 08/22] ocrd interface: add ignore_page_extraction --- qurator/eynollah/ocrd-tool.json | 5 +++++ qurator/eynollah/processor.py | 1 + 2 files changed, 6 insertions(+) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 02a2a23..127b95b 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -39,6 +39,11 @@ "default": false, "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" }, + "ignore_page_extraction": { + "type": "boolean", + "default": false, + "description": "if this parameter set to true, this tool would ignore page extraction" + }, "allow_scaling": { "type": "boolean", "default": false, diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index d1bc44a..9fcf2d5 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -31,6 +31,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional curved_line=self.parameter['curved_line'], light_version=self.parameter['light_mode'], right2left=self.parameter['right_to_left'], + ignore_page_extraction=self.parameter['ignore_page_extraction'], textline_light=self.parameter['textline_light'], full_layout=self.parameter['full_layout'], allow_scaling=self.parameter['allow_scaling'], From 8dfecb70d4cdd3364bd64e8048275f4840d935ae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 19 Jan 2024 16:17:02 +0000 Subject: [PATCH 09/22] adapt to ocrd>=2.54 url vs local_filename # Conflicts: # qurator/eynollah/processor.py --- qurator/eynollah/processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 9fcf2d5..488715d 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -23,7 +23,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page = pcgts.get_Page() # XXX loses DPI information # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename + if not('://' in page.imageFilename): + image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename + else: + # could be a URL with file:// or truly remote + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename Eynollah( self.resolve_resource(self.parameter['models']), self.logger, From 3381e5a01561d08ca10ad253fba27779453e0982 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 24 Jan 2024 19:33:49 +0100 Subject: [PATCH 10/22] adapt to OcrdFile.local_filename now :Path # Conflicts: # qurator/eynollah/processor.py --- qurator/eynollah/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 488715d..92a91c2 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -43,6 +43,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional tables=self.parameter['tables'], override_dpi=self.parameter['dpi'], pcgts=pcgts, - image_filename=image_filename + image_filename=str(image_filename) ).run() return OcrdPageResult(pcgts) From 49c1a8f38478715395fdaa10f953c3eaee41df5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 May 2024 14:29:57 +0000 Subject: [PATCH 11/22] fix namespace pkg setup From c37d95dedfac320b4e6f40880f1dc04e9ee7e0df Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 23 May 2024 21:19:33 +0200 Subject: [PATCH 12/22] non-legacy namespace package # Conflicts: # setup.py From 61bcb435ae57c6194b64df3d9678bcce811712e6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 11 Jun 2023 22:14:41 +0200 Subject: [PATCH 13/22] processor: reuse loaded models across pages, use derived images # Conflicts: # qurator/eynollah/processor.py --- qurator/eynollah/processor.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 92a91c2..ea144e4 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -12,6 +12,8 @@ def metadata_filename(self) -> str: return 'eynollah/ocrd-tool.json' def setup(self) -> None: + # for caching models + self.models = None if self.parameter['textline_light'] and not self.parameter['light_mode']: raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled") @@ -21,14 +23,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional assert self.parameter pcgts = input_pcgts[0] page = pcgts.get_Page() + # if not('://' in page.imageFilename): + # image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename + # else: + # # could be a URL with file:// or truly remote + # image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - if not('://' in page.imageFilename): - image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename - else: - # could be a URL with file:// or truly remote - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename - Eynollah( + page_image, _, _ = self.workspace.image_from_page( + page, page_id, + # avoid any features that would change the coordinate system: cropped,deskewed + # (the PAGE builder merely adds regions, so afterwards we would not know which to transform) + # also avoid binarization as models usually fare better on grayscale/RGB + feature_filter='cropped,deskewed,binarized') + eynollah = Eynollah( self.resolve_resource(self.parameter['models']), self.logger, allow_enhancement=self.parameter['allow_enhancement'], @@ -43,6 +50,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional tables=self.parameter['tables'], override_dpi=self.parameter['dpi'], pcgts=pcgts, - image_filename=str(image_filename) - ).run() + image_filename=page.imageFilename, + image_pil=page_image + ) + if self.models is not None: + # reuse loaded models from previous page + eynollah.models = self.models + eynollah.run() + self.models = eynollah.models return OcrdPageResult(pcgts) From d98fa2a85b7411338ee102039503e4dc142eb068 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 28 May 2024 14:07:45 +0200 Subject: [PATCH 14/22] check_dpi: fix Pillow type detection From ecd202ea4c57ed09c78f4880f4592b435d36ed3e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Aug 2024 10:39:22 +0200 Subject: [PATCH 15/22] processor.py: Simplify import Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- qurator/eynollah/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index ea144e4..e163ecd 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -1,5 +1,5 @@ from typing import Optional -from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd import OcrdPageResult from ocrd_models import OcrdPage from ocrd import Processor From d26079db850cafe41225b449408b337a605e32ab Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 10:40:15 +0200 Subject: [PATCH 16/22] procesor.py: simplify imports further --- qurator/eynollah/processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index e163ecd..2a383d8 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -1,7 +1,6 @@ from typing import Optional -from ocrd import OcrdPageResult from ocrd_models import OcrdPage -from ocrd import Processor +from ocrd import Processor, OcrdPageResult from .eynollah import Eynollah From 7b92620a104d5ff5f72c2a1755466eab5bc05843 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Aug 2024 10:45:53 +0200 Subject: [PATCH 17/22] processor: no more DPI info lost Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- qurator/eynollah/processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 2a383d8..01dd797 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -27,7 +27,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # else: # # could be a URL with file:// or truly remote # image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename - # XXX loses DPI information page_image, _, _ = self.workspace.image_from_page( page, page_id, # avoid any features that would change the coordinate system: cropped,deskewed From aef46a4669fa3c34b5df17ded284d072f32d5a46 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 26 Aug 2024 11:31:13 +0200 Subject: [PATCH 18/22] require ocrd >= 3.0.0b1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index feeea99..edfbe76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # ocrd includes opencv, numpy, shapely, click -ocrd >= 3.0.0a2 +ocrd >= 3.0.0b1 numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow == 2.12.1 From dfc4ac2538654ef446beb69652ea64543db2cc93 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 22:46:51 +0200 Subject: [PATCH 19/22] setuptools: fix (packages.find.where prevented finding namespace qurator) --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8f83249..9e610c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,8 +37,5 @@ Repository = "https://github.com/qurator-spk/eynollah.git" [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} -[tool.setuptools.packages.find] -where = ["qurator"] - [tool.setuptools.package-data] "*" = ["*.json", '*.yml', '*.xml', '*.xsd'] From 1e902571ead1b8493376e2ca7d1dc401aefd929d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:15:11 +0200 Subject: [PATCH 20/22] undo customizing metadata_filename (not correct with namespace pkg support in core) --- qurator/eynollah/processor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 01dd797..fd7dd2a 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -6,10 +6,6 @@ class EynollahProcessor(Processor): - @property - def metadata_filename(self) -> str: - return 'eynollah/ocrd-tool.json' - def setup(self) -> None: # for caching models self.models = None From 17eafc1ccb3980f2bedb5183d45942fc83a838ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:15:31 +0200 Subject: [PATCH 21/22] adapt tool json to v3 --- qurator/eynollah/ocrd-tool.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json index 127b95b..2da970d 100644 --- a/qurator/eynollah/ocrd-tool.json +++ b/qurator/eynollah/ocrd-tool.json @@ -6,8 +6,8 @@ "executable": "ocrd-eynollah-segment", "categories": ["Layout analysis"], "description": "Segment page into regions and lines and do reading order detection with eynollah", - "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], - "output_file_grp": ["OCR-D-SEG-LINE"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region", "layout/segmentation/line"], "parameters": { "models": { From fdedae24066e5a99a3c4584d9c583b9318c0077b Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 2 Sep 2024 11:47:57 +0200 Subject: [PATCH 22/22] require ocrd>=3.0.0b4 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index edfbe76..30d4c51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # ocrd includes opencv, numpy, shapely, click -ocrd >= 3.0.0b1 +ocrd >= 3.0.0b4 numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow == 2.12.1