Merge pull request #66 from OCR-D/file-ids-and-such

one output_file_grp, make_file_id, assert_file_grp_cardinality
OCR-D · Aug 21, 2020 · 97568ce
2 parents 962617d + 282209a
commit 97568ce
Show file tree

Hide file tree

Showing 13 changed files with 174 additions and 171 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -51,7 +51,7 @@ jobs:
        # https://nose.readthedocs.io
 
       - run: . venv/bin/activate; make test
-      - run: . venv/bin/activate; make cli-test
+      - run: . venv/bin/activate; pip install .; make cli-test
 
       - store_artifacts:
           path: test-reports

diff --git a/.gitignore b/.gitignore
@@ -28,3 +28,4 @@ lib64
 *.png
 *.jpg
 ocrd_anybaseocr/pix2pixhd
+/models
diff --git a/Makefile b/Makefile
@@ -17,6 +17,8 @@ TESTS=tests
 # Tag to publish docker image to
 DOCKER_TAG = ocrd/anybaseocr
 
+MODELS = $(PWD)/models
+
 # BEGIN-EVAL makefile-parser --make-help Makefile
 
 help:
@@ -58,6 +60,7 @@ install: patch-pix2pixhd
 # Patch pix2pixhd to trick it into thinking it was part of this mess
 PIX2PIX_FILES = ocrd_anybaseocr/pix2pixhd/*/*.py ocrd_anybaseocr/pix2pixhd/*.py
 patch-pix2pixhd: pix2pixhd
+	touch ocrd_anybaseocr/pix2pixhd/__init__.py
 	sed -i 's,^from util,from ..util,' $(PIX2PIX_FILES)
 	sed -i 's,^import util,import ..util,' $(PIX2PIX_FILES)
 	sed -i 's,^\(\s*\)from data,\1from .data,' ocrd_anybaseocr/pix2pixhd/*.py
@@ -127,31 +130,31 @@ cli-test: assets-clean assets \
 
 # Test binarization CLI
 test-binarize:
-	cd $(TESTDATA) && ocrd-anybaseocr-binarize -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN-TEST
+	cd $(TESTDATA) && ocrd-anybaseocr-binarize -m mets.xml -I MAX -O BIN-TEST
 
 # Test deskewing CLI
 test-deskew:
-	cd $(TESTDATA) && ocrd-anybaseocr-deskew -m mets.xml -I OCR-D-IMG-BIN-TEST -O OCR-D-IMG-DESKEW-TEST
+	cd $(TESTDATA) && ocrd-anybaseocr-deskew -m mets.xml -I BIN-TEST -O DESKEW-TEST
 
 # Test cropping CLI
 test-crop:
-	cd $(TESTDATA) && ocrd-anybaseocr-crop -m mets.xml -I OCR-D-IMG-DESKEW-TEST -O OCR-D-IMG-CROP-TEST
+	cd $(TESTDATA) && ocrd-anybaseocr-crop -m mets.xml -I DESKEW-TEST -O CROP-TEST
 
 # Test text/non-text segmentation CLI
 test-tiseg:
-	cd $(TESTDATA) && ocrd-anybaseocr-tiseg -m mets.xml -I OCR-D-IMG-CROP-TEST -O OCR-D-IMG-TISEG-TEST
+	cd $(TESTDATA) && ocrd-anybaseocr-tiseg -m mets.xml -I CROP-TEST -O TISEG-TEST -P seg_weights seg_model.hdf5
 
 # Test block segmentation CLI
 test-block-segmentation:
-	cd $(TESTDATA) && ocrd-anybaseocr-block-segmentation -m mets.xml -I OCR-D-IMG-TISEG-TEST -O OCR-D-BLOCK-SEGMENT
+	cd $(TESTDATA) && ocrd-anybaseocr-block-segmentation -m mets.xml -I TISEG-TEST -O OCR-D-BLOCK-SEGMENT -P block_segmentation_weights block_segmentation_weights.h5
 
 # Test textline extraction CLI
 test-textline:
-	cd $(TESTDATA) && ocrd-anybaseocr-textline -m mets.xml -I OCR-D-BLOCK-SEGMENT -O OCR-D-IMG-TL-TEST
+	cd $(TESTDATA) && ocrd-anybaseocr-textline -m mets.xml -I OCR-D-BLOCK-SEGMENT -O TL-TEST
 
 # Test document structure analysis CLI
 test-layout-analysis:
 	cd $(TESTDATA) && ocrd-anybaseocr-layout-analysis -m mets.xml \
-		-I OCR-D-IMG-BIN-TEST -O OCR-D-IMG-LAYOUT \
-		-P model_path models/structure_analysis.h5 \
-		-P class_mapping_path models/mapping_densenet.pickle
+		-I BIN-TEST -O LAYOUT \
+		-P model_path structure_analysis.h5 \
+		-P class_mapping_path mapping_densenet.pickle
diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_binarize.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_binarize.py
@@ -58,7 +58,12 @@
     MetadataItemType,
     LabelsType, LabelType
     )
-from ocrd_utils import getLogger, concat_padded, MIMETYPE_PAGE
+from ocrd_utils import (
+    getLogger,
+    MIMETYPE_PAGE,
+    make_file_id,
+    assert_file_grp_cardinality
+)
 
 # Ignore zoom warning from interpolation
 import warnings
@@ -100,19 +105,17 @@ def dshow(self, image, info):
         ginput(1, self.parameter['debug'])
 
     def process(self):
-        try:
-            page_grp, self.image_grp = self.output_file_grp.split(',')
-        except ValueError:
-            page_grp = self.output_file_grp
-            self.image_grp = FALLBACK_IMAGE_GRP
-            LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
         oplevel = self.parameter['operation_level']
 
         for (n, input_file) in enumerate(self.input_files):            
+            file_id = make_file_id(input_file, self.output_file_grp)
             page_id = input_file.pageId or input_file.ID
 
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
+            pcgts.set_pcGtsId(file_id)
             metadata = pcgts.get_Metadata()
             metadata.add_MetadataItem(
                     MetadataItemType(type_="processingStep",
@@ -138,18 +141,12 @@ def process(self):
                     # TODO: not tested on regions
                     self._process_segment(region_image, page, region_xywh, region.id, input_file, str(n)+"_"+str(k))
 
-            # Use input_file's basename for the new file -
-            # this way the files retain the same basenames:
-            file_id = input_file.ID.replace(self.input_file_grp, page_grp)            
-            if file_id == input_file.ID:
-                file_id = concat_padded(page_grp, n)          
             self.workspace.add_file(
                 ID=file_id,
-                file_grp=page_grp,
+                file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(page_grp,
-                                        file_id + '.xml'),
+                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                 content=to_xml(pcgts).encode('utf-8')
             )
 
@@ -244,13 +241,11 @@ def _process_segment(self,page_image, page, page_xywh, page_id, input_file, n):
         bin_array = array(255*(binarized>ocrolib.midrange(binarized)),'B')
         bin_image = ocrolib.array2pil(bin_array)                            
 
-        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
-        if file_id == input_file.ID:
-            file_id = concat_padded(self.image_grp, n)
+        file_id = make_file_id(input_file, self.output_file_grp)
         file_path = self.workspace.save_image_file(bin_image,
-                                   file_id,
+                                   file_id + '-IMG',
                                    page_id=page_id,
-                                   file_grp=self.image_grp
+                                   file_grp=self.output_file_grp
             )     
         page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features']))
 

diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_block_segmentation.py
@@ -1,10 +1,9 @@
-# pylint: missing-module-docstring, missing-class-docstring, invalid-name
+# pylint: disable=missing-module-docstring, missing-class-docstring, invalid-name
 # pylint: disable=line-too-long, import-error, no-name-in-module, too-many-statements
 # pylint: disable=wrong-import-position, wrong-import-order, too-many-locals, too-few-public-methods
 import sys
 import os
 from pathlib import Path
-import warnings
 import click
 
 import cv2
@@ -18,7 +17,8 @@
 from ocrd_modelfactory import page_from_file
 from ocrd_utils import (
     getLogger,
-    concat_padded,
+    make_file_id,
+    assert_file_grp_cardinality,
     MIMETYPE_PAGE,
     coordinates_for_segment,
     points_from_polygon,
@@ -73,14 +73,11 @@ def __init__(self, *args, **kwargs):
 
     def process(self):
 
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
         if not tf.test.is_gpu_available():
             LOG.warning("Tensorflow cannot detect CUDA installation. Running without GPU will be slow.")
-        try:
-            page_grp, self.image_grp = self.output_file_grp.split(',')
-        except ValueError:
-            page_grp = self.output_file_grp
-            self.image_grp = FALLBACK_IMAGE_GRP
-            LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
 
         model_path = Path(self.parameter['block_segmentation_model'])
         model_weights = Path(self.parameter['block_segmentation_weights'])
@@ -139,19 +136,15 @@ def process(self):
             else:
                 LOG.warning('Operation level %s, but should be "page".', oplevel)
                 break
-            file_id = input_file.ID.replace(self.input_file_grp, page_grp)
 
-            # Use input_file's basename for the new file -
-            # this way the files retain the same basenames:
-            if file_id == input_file.ID:
-                file_id = concat_padded(page_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
+            pcgts.set_pcGtsId(file_id)
             self.workspace.add_file(
                 ID=file_id,
-                file_grp=page_grp,
+                file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(self.output_file_grp,
-                                            file_id + '.xml'),
+                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                 content=to_xml(pcgts).encode('utf-8')
             )
 
@@ -161,7 +154,7 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n,
         if page.get_TextRegion():
             if self.parameter['overwrite']:
                 LOG.info('removing existing TextRegions in page "%s"', page_id)
-                textregion.set_TextRegion([])
+                page.set_TextRegion([])
             else:
                 LOG.warning('keeping existing TextRegions in page "%s"', page_id)
                 return
@@ -366,14 +359,11 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n,
 
             region_img = ocrolib.array2pil(region_img)
 
-            file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.image_grp, n)
-
+            file_id = make_file_id(input_file, self.output_file_grp)
             file_path = self.workspace.save_image_file(region_img,
                                                        file_id+"_"+str(i),
                                                        page_id=page_id,
-                                                       file_grp=self.image_grp)
+                                                       file_grp=self.output_file_grp)
 
             # ai = AlternativeImageType(filename=file_path, comments=page_xywh['features'])
             region_id = '%s_region%04d' % (page_id, i)

diff --git a/ocrd_anybaseocr/cli/ocrd_anybaseocr_cropping.py b/ocrd_anybaseocr/cli/ocrd_anybaseocr_cropping.py
@@ -1,3 +1,4 @@
+# pylint: disable=invalid-name
 # ======================================================================
 # ====================================
 # README file for Page Cropping component
@@ -47,9 +48,12 @@
 from ocrd_utils import (
     getLogger,
     crop_image,
+    make_file_id,
+    assert_file_grp_cardinality,
     concat_padded, 
     MIMETYPE_PAGE, 
     coordinates_for_segment,
+    bbox_from_points,
     points_from_polygon
 )
 from ocrd_models.ocrd_page import (
@@ -64,8 +68,6 @@
 TOOL = 'ocrd-anybaseocr-crop'
 
 LOG = getLogger('OcrdAnybaseocrCropper')
-FALLBACK_IMAGE_GRP = 'OCR-D-IMG-CROP'
-
 
 class OcrdAnybaseocrCropper(Processor):
 
@@ -420,14 +422,9 @@ def crop_area(self, textarea, binImg, rgb, colSeparator):
 
     def process(self):
         """Performs border detection on the workspace. """
-        try:
-            LOG.info("OUTPUT FILE %s", self.output_file_grp)
-            page_grp, self.image_grp = self.output_file_grp.split(',')
-        except ValueError:
-            page_grp = self.output_file_grp
-            self.image_grp = FALLBACK_IMAGE_GRP
-            LOG.info(
-                "No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
         oplevel = self.parameter['operation_level']
         for (n, input_file) in enumerate(self.input_files):
             page_id = input_file.pageId or input_file.ID
@@ -465,20 +462,14 @@ def process(self):
             else:
                 raise Exception(
                     'Operation level %s, but should be "page".', oplevel)
-            file_id = input_file.ID.replace(
-                self.input_file_grp, page_grp)
-
-            # Use input_file's basename for the new file -
-            # this way the files retain the same basenames:
-            if file_id == input_file.ID:
-                file_id = concat_padded(page_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
+            pcgts.set_pcGtsId(file_id)
             self.workspace.add_file(
                 ID=file_id,
-                file_grp=page_grp,
+                file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(page_grp,
-                                            file_id + '.xml'),
+                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                 content=to_xml(pcgts).encode('utf-8')
             )
 
@@ -530,14 +521,12 @@ def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
         page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
         page_xywh['features'] += ',cropped'
 
-        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
-        if file_id == input_file.ID:
-            file_id = concat_padded(self.image_grp, n)
+        file_id = make_file_id(input_file, self.output_file_grp)
 
         file_path = self.workspace.save_image_file(page_image,
-                                                   file_id,
+                                                   file_id + '-IMG',
                                                    page_id=page_id,
-                                                   file_grp=self.image_grp)
+                                                   file_grp=self.output_file_grp)
         page.add_AlternativeImage(AlternativeImageType(
             filename=file_path, comments=page_xywh['features']))
diff --git a/.gitignore b/.gitignore
@@ -28,3 +28,4 @@ lib64
 *.png
 *.jpg
 ocrd_anybaseocr/pix2pixhd
+/models