port processor to core v3 #130

Draft. Wants to merge 25 commits into base: main.
Changes shown from 14 of 25 commits.

Commits
0a3f525  port processor to core v3 (kba, Aug 23, 2024)
4a13781  class Eynollah: add typing, consistent interface in CLI and OCR-D CLI (kba, Aug 23, 2024)
9ce02a5  ocrd-tool: add "allow_enhancement" parameter (kba, Aug 23, 2024)
0d83db7  update processor to the latest change in bertsky/core#14 (kba, Aug 24, 2024)
87adc4b  ocrd interface: add light_mode parameter (kba, Aug 24, 2024)
39b16e5  ocrd interface: add textline_light (kba, Aug 24, 2024)
ddcc019  ocrd interface: add right_to_left (kba, Aug 24, 2024)
d7caeb2  ocrd interface: add ignore_page_extraction (kba, Aug 24, 2024)
8dfecb7  adapt to ocrd>=2.54 url vs local_filename (bertsky, Jan 19, 2024)
3381e5a  adapt to OcrdFile.local_filename now :Path (bertsky, Jan 24, 2024)
49c1a8f  fix namespace pkg setup (bertsky, May 24, 2024)
c37d95d  non-legacy namespace package (bertsky, May 23, 2024)
61bcb43  processor: reuse loaded models across pages, use derived images (bertsky, Jun 11, 2023)
d98fa2a  check_dpi: fix Pillow type detection (bertsky, May 28, 2024)
ecd202e  processor.py: Simplify import (kba, Aug 26, 2024)
d26079d  procesor.py: simplify imports further (kba, Aug 26, 2024)
7b92620  processor: no more DPI info lost (kba, Aug 26, 2024)
aef46a4  require ocrd >= 3.0.0b1 (kba, Aug 26, 2024)
dfc4ac2  setuptools: fix (packages.find.where prevented finding namespace qura… (bertsky, Aug 30, 2024)
1e90257  undo customizing metadata_filename (not correct with namespace pkg su… (bertsky, Sep 1, 2024)
17eafc1  adapt tool json to v3 (bertsky, Sep 1, 2024)
9b274dc  Merge pull request #134 from bertsky/v3-api (kba, Sep 2, 2024)
f9c2d85  Merge branch 'main' into v3-api (kba, Sep 2, 2024)
fdedae2  require ocrd>=3.0.0b4 (kba, Sep 2, 2024)
c6e0e05  Merge branch 'main' into v3-api (kba, Sep 2, 2024)
5 changes: 3 additions & 2 deletions qurator/eynollah/cli.py
@@ -1,6 +1,6 @@
import sys
import click
from ocrd_utils import initLogging, setOverrideLogLevel
from ocrd_utils import getLogger, initLogging, setOverrideLogLevel
from qurator.eynollah.eynollah import Eynollah


@@ -176,10 +176,11 @@ def main(
print('Error: You used -tll to enable light textline detection but -light is not enabled')
sys.exit(1)
eynollah = Eynollah(
model,
getLogger('Eynollah'),
image_filename=image,
dir_out=out,
dir_in=dir_in,
dir_models=model,
dir_of_cropped_images=save_images,
dir_of_layout=save_layout,
dir_of_deskewed=save_deskewed,
58 changes: 31 additions & 27 deletions qurator/eynollah/eynollah.py
@@ -6,14 +6,18 @@
document layout analysis (segmentation) with output in PAGE-XML
"""

from logging import Logger
import math
import os
import sys
import time
from typing import Optional
import warnings
from pathlib import Path
from multiprocessing import Process, Queue, cpu_count
import gc
from PIL.Image import Image
from ocrd import OcrdPage
from ocrd_utils import getLogger
import cv2
import numpy as np
@@ -142,32 +146,32 @@ def get_config(self):
class Eynollah:
def __init__(
self,
dir_models,
image_filename=None,
image_pil=None,
image_filename_stem=None,
dir_out=None,
dir_in=None,
dir_of_cropped_images=None,
dir_of_layout=None,
dir_of_deskewed=None,
dir_of_all=None,
dir_save_page=None,
enable_plotting=False,
allow_enhancement=False,
curved_line=False,
textline_light=False,
full_layout=False,
tables=False,
right2left=False,
input_binary=False,
allow_scaling=False,
headers_off=False,
light_version=False,
ignore_page_extraction=False,
override_dpi=None,
logger=None,
pcgts=None,
dir_models : str,
logger : Logger,
image_filename : Optional[str] = None,
image_pil : Optional[Image] = None,
image_filename_stem : Optional[str] = None,
dir_out : Optional[str] = None,
dir_in : Optional[str] = None,
dir_of_cropped_images : Optional[str] = None,
dir_of_layout : Optional[str] = None,
dir_of_deskewed : Optional[str] = None,
dir_of_all : Optional[str] = None,
dir_save_page : Optional[str] = None,
enable_plotting : bool = False,
allow_enhancement : bool = False,
curved_line : bool = False,
textline_light : bool = False,
full_layout : bool = False,
tables : bool = False,
right2left : bool = False,
input_binary : bool = False,
allow_scaling : bool = False,
headers_off : bool = False,
light_version : bool = False,
ignore_page_extraction : bool = False,
override_dpi : Optional[int] = None,
pcgts : Optional[OcrdPage] = None,
):
if not dir_in:
if image_pil:
@@ -213,7 +217,7 @@ def __init__(
curved_line=self.curved_line,
textline_light = self.textline_light,
pcgts=pcgts)
self.logger = logger if logger else getLogger('eynollah')
self.logger = logger
self.dir_models = dir_models

self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425"
51 changes: 38 additions & 13 deletions qurator/eynollah/ocrd-tool.json
@@ -29,36 +29,61 @@
"default": true,
"description": "Try to detect all element subtypes, including drop-caps and headings"
},
"tables": {
"type": "boolean",
"default": false,
"description": "Try to detect table regions"
},
"tables": {
"type": "boolean",
"default": false,
"description": "Try to detect table regions"
},
"curved_line": {
"type": "boolean",
"default": false,
"description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time"
},
"ignore_page_extraction": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool would ignore page extraction"
},
"allow_scaling": {
"type": "boolean",
"default": false,
"description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)"
},
"allow_enhancement": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool would check that input image need resizing and enhancement or not."
},
"light_mode": {
"type": "boolean",
"default": false,
"description": "lighter and faster but simpler method for main region detection and deskewing"
},
"textline_light": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool will try to return contoure of textlines instead of rectangle bounding box of textline with a faster method."
},
"right_to_left": {
"type": "boolean",
"default": false,
"description": "if this parameter set to true, this tool will extract right-to-left reading order."
},
"headers_off": {
"type": "boolean",
"default": false,
"description": "ignore the special role of headings during reading order detection"
}
},
"resources": [
{
"description": "models for eynollah (TensorFlow format)",
"url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz",
"name": "default",
"size": 1761991295,
"type": "archive",
"path_in_archive": "models_eynollah"
}
]
}
}
115 changes: 54 additions & 61 deletions qurator/eynollah/processor.py
@@ -1,68 +1,61 @@
from json import loads
from pkg_resources import resource_string
from tempfile import NamedTemporaryFile
from pathlib import Path
from os.path import join

from PIL import Image

from typing import Optional
from ocrd.processor.ocrd_page_result import OcrdPageResult
from ocrd_models import OcrdPage
from ocrd import Processor
from ocrd_modelfactory import page_from_file, exif_from_filename
from ocrd_models import OcrdFile, OcrdExif
from ocrd_models.ocrd_page import to_xml
from ocrd_utils import (
getLogger,
MIMETYPE_PAGE,
assert_file_grp_cardinality,
make_file_id
)

from .eynollah import Eynollah
from .utils.pil_cv2 import pil2cv

OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))

class EynollahProcessor(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
kwargs['version'] = OCRD_TOOL['version']
super().__init__(*args, **kwargs)
@property
def metadata_filename(self) -> str:
return 'eynollah/ocrd-tool.json'

def setup(self) -> None:
# for caching models
self.models = None
if self.parameter['textline_light'] and not self.parameter['light_mode']:
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled")

def process(self):
LOG = getLogger('eynollah')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)
for n, input_file in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
pcgts = page_from_file(self.workspace.download_file(input_file))
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
self.add_metadata(pcgts)
page = pcgts.get_Page()
# XXX loses DPI information
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename
eynollah_kwargs = {
'dir_models': self.resolve_resource(self.parameter['models']),
'allow_enhancement': False,
'curved_line': self.parameter['curved_line'],
'full_layout': self.parameter['full_layout'],
'allow_scaling': self.parameter['allow_scaling'],
'headers_off': self.parameter['headers_off'],
'tables': self.parameter['tables'],
'override_dpi': self.parameter['dpi'],
'logger': LOG,
'pcgts': pcgts,
'image_filename': image_filename
}
Eynollah(**eynollah_kwargs).run()
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=page_id,
mimetype=MIMETYPE_PAGE,
local_filename=join(self.output_file_grp, file_id) + '.xml',
content=to_xml(pcgts))
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
assert input_pcgts
assert input_pcgts[0]
assert self.parameter
pcgts = input_pcgts[0]
page = pcgts.get_Page()
# if not('://' in page.imageFilename):
# image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename
# else:
# # could be a URL with file:// or truly remote
# image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
Contributor:

Suggested change: remove the commented-out block above.
This whole effort was to ensure we can pass a working local filename, as (was) needed by Eynollah. The approach by OCR-D is Workspace.image_from_page / Workspace.image_from_segment which will search for the right original or derived image, download it if necessary and load it into memory.

I don't recall what the new behaviour of Eynollah is. If both an image filename and an image object are passed, who wins?

Assuming it's the memory object: this can be removed. (But then I wonder why we still pass the image filename at all...)

Contributor Author:
Well, currently we have

            if image_pil:
                self._imgs = self._cache_images(image_pil=image_pil)
            else:
                self._imgs = self._cache_images(image_filename=image_filename)
[...]
     def _cache_images(self, image_filename=None, image_pil=None):
         ret = {}
         if image_filename:
             ret['img'] = cv2.imread(image_filename)
             self.dpi = check_dpi(image_filename)
         else:
             ret['img'] = pil2cv(image_pil)
             self.dpi = check_dpi(image_pil)

image_filename is (should be) then only used passively, to generate filenames of plotted debug images as well as for PAGE serialization.

So I think image_pil should win but for now we need both. But as I said above, one of those things I would love to untangle in the refactoring.
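
To make the precedence concrete, here is a minimal sketch of the call pattern under discussion, using the names from the new process_page_pcgts shown further down (the comments are editorial annotations, not part of the diff):

    # let the workspace pick the right original or derived image and load it
    # into memory, filtering out features that would change the coordinate system
    page_image, _, _ = self.workspace.image_from_page(
        page, page_id, feature_filter='cropped,deskewed,binarized')
    eynollah = Eynollah(
        self.resolve_resource(self.parameter['models']),
        self.logger,
        pcgts=pcgts,
        image_pil=page_image,               # wins: actual pixel source
        image_filename=page.imageFilename,  # passive: plot naming, PAGE serialization
    )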

# XXX loses DPI information
page_image, _, _ = self.workspace.image_from_page(
page, page_id,
# avoid any features that would change the coordinate system: cropped,deskewed
# (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
# also avoid binarization as models usually fare better on grayscale/RGB
feature_filter='cropped,deskewed,binarized')
eynollah = Eynollah(
self.resolve_resource(self.parameter['models']),
self.logger,
allow_enhancement=self.parameter['allow_enhancement'],
curved_line=self.parameter['curved_line'],
light_version=self.parameter['light_mode'],
right2left=self.parameter['right_to_left'],
ignore_page_extraction=self.parameter['ignore_page_extraction'],
textline_light=self.parameter['textline_light'],
full_layout=self.parameter['full_layout'],
allow_scaling=self.parameter['allow_scaling'],
headers_off=self.parameter['headers_off'],
tables=self.parameter['tables'],
override_dpi=self.parameter['dpi'],
pcgts=pcgts,
image_filename=page.imageFilename,
image_pil=page_image
Contributor:

Note: that filename might not be where that image came from in workspace.image_from_page. It could well be a derived image generated by some previous processor (just not a cropped, deskewed or binarized image, because that would have changed its coordinate system).

Contributor Author:

It's still a bit hazy for me when image_filename is actually used. Ideally, image_pil should take preference and image_filename is only for the plotter/writer, at least in the "single image mode" we're using.

One of the aspects I hope I'll be able to improve a bit with https://github.com/qurator-spk/eynollah/tree/refactoring-2024-08/
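
A small illustration of the provenance point above (a sketch assuming the standard PAGE accessors generated in ocrd_models; not part of the diff):

    # page.imageFilename refers to the original scan; images derived by earlier
    # processors are recorded as AlternativeImage elements, so the pixels that
    # image_from_page returns may come from one of those files instead
    page = pcgts.get_Page()
    print('original image:', page.imageFilename)
    for alt in page.get_AlternativeImage() or []:
        # the comments attribute carries the feature list, e.g. "cropped,deskewed"
        print('derived image:', alt.get_filename(), 'features:', alt.get_comments())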

Contributor:

Perhaps we can also re-use the session across Eynollah invocations, in addition to the models?

Contributor Author:

In theory, yes, but with standalone eynollah being focused on batch processing now, I am honestly not sure how/where sessions are defined for the non-dir_in option - @vahidrezanezhad can you tell us?

)
if self.models is not None:
# reuse loaded models from previous page
eynollah.models = self.models
eynollah.run()
self.models = eynollah.models
return OcrdPageResult(pcgts)
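
On the model/session reuse question above: the PR already caches the loaded models on the processor instance (self.models) and hands them to the next page's Eynollah instance, as visible in the diff; reusing the TensorFlow session would be a further step on top of that. A standalone sketch of the caching shape, with purely hypothetical names:

    class ModelCache:
        """Load expensive models once and reuse them for every later page."""
        def __init__(self):
            self._models = None

        def get(self, loader):
            if self._models is None:
                self._models = loader()   # only the first call actually loads
            return self._models

    def load_models():
        # stand-in for Eynollah's TensorFlow model loading (hypothetical)
        return {'enhancement': object(), 'textline': object()}

    cache = ModelCache()
    for page_id in ('PHYS_0001', 'PHYS_0002'):
        models = cache.get(load_models)   # same objects on every iteration
        print(page_id, id(models))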
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
# ocrd includes opencv, numpy, shapely, click
ocrd >= 2.23.3
ocrd >= 3.0.0a2
numpy <1.24.0
scikit-learn >= 0.23.2
tensorflow == 2.12.1