feat: implement command line for running OCR over screenshot image files

Includes config for building executable using PyOxidizer.
cofiem · Feb 19, 2023 · ea48ba6 · ea48ba6
1 parent 1c95807
commit ea48ba6
Show file tree

Hide file tree

Showing 10 changed files with 438 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# idea
+.idea/
diff --git a/README.md b/README.md
@@ -1,2 +1,3 @@
 # screenshot-ocr
+
 Extract text from screenshots.
diff --git a/pyoxidizer.bzl b/pyoxidizer.bzl
@@ -0,0 +1,82 @@
+def make_exe():
+    # Build the standalone `screenshot-ocr` executable from the default Python
+    # distribution, embedding the `screenshot_ocr` package.
+    dist = default_python_distribution()
+    policy = dist.make_python_packaging_policy()
+    python_config = dist.make_python_interpreter_config()
+
+    # Run a Python module as __main__ when the interpreter starts.
+    python_config.run_module = "screenshot_ocr.main"
+
+    # Produce a PythonExecutable from a Python distribution, embedded
+    # resources, and other options. The returned object represents the
+    # standalone executable that will be built.
+    exe = dist.to_python_executable(
+        name="screenshot-ocr",
+
+        # If no argument passed, the default `PythonPackagingPolicy` for the
+        # distribution is used.
+        packaging_policy=policy,
+
+        # If no argument passed, the default `PythonInterpreterConfig` is used.
+        config=python_config,
+    )
+
+    # Read Python files from the current directory (".") and add them to our
+    # embedded context, taking just the resources belonging to the
+    # `screenshot_ocr` Python package.
+    exe.add_python_resources(exe.read_package_root(
+        path=".",
+        packages=["screenshot_ocr"],
+    ))
+
+    # Return our `PythonExecutable` instance so it can be built and
+    # referenced by other consumers of this target.
+    return exe
+
+def make_embedded_resources(exe):
+    # Convert the executable passed in as `exe` into its embedded resources
+    # form.
+    return exe.to_embedded_resources()
+
+def make_install(exe):
+    # Describe the install layout: a manifest holding only the executable.
+
+    # Create an object that represents our installed application file layout.
+    files = FileManifest()
+
+    # Add the generated executable to our install layout in the root directory.
+    files.add_python_resource("screenshot-ocr", exe)
+
+    return files
+
+def make_msi(exe):
+    # See the full docs for more. But this will convert your Python executable
+    # into a `WiXMSIBuilder` Starlark type, which will be converted to a Windows
+    # .msi installer when it is built.
+    return exe.to_wix_msi_builder(
+        # Simple identifier of your app.
+        "screenshot-ocr",
+        # The name of your application.
+        "Screenshot OCR",
+        # The version of your application.
+        # NOTE(review): hard-coded here; presumably needs updating by hand on
+        # each release - confirm where the package version is defined.
+        "0.1.0",
+        # The author/manufacturer of your application.
+        "Mark C"
+    )
+
+
+# Dynamically enable automatic code signing.
+def register_code_signers():
+    # You will need to run with `pyoxidizer build --var ENABLE_CODE_SIGNING 1` for
+    # this if block to be evaluated.
+    if not VARS.get("ENABLE_CODE_SIGNING"):
+        return
+    # NOTE: no code signer is configured after this guard, so the function
+    # currently does nothing whether or not ENABLE_CODE_SIGNING is set.
+
+
+# Call our function to set up automatic code signers.
+register_code_signers()
+
+# Tell PyOxidizer about the build targets defined above.
+# "install" is flagged as the default target and "resources" as the default
+# build script target; every target other than "exe" depends on "exe".
+register_target("exe", make_exe)
+register_target("resources", make_embedded_resources, depends=["exe"], default_build_script=True)
+register_target("install", make_install, depends=["exe"], default=True)
+register_target("msi_installer", make_msi, depends=["exe"])
+
+# Resolve whatever targets the invoker of this configuration file is requesting
+# be resolved.
+resolve_targets()
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,2 @@
+black
+pyoxidizer
diff --git a/screenshot_ocr/__init__.py b/screenshot_ocr/__init__.py
diff --git a/screenshot_ocr/cli.py b/screenshot_ocr/cli.py
@@ -0,0 +1,149 @@
+import argparse
+import logging
+import pathlib
+import shutil
+import sys
+import typing
+
+from screenshot_ocr import tesseract, files
+
+logger = logging.getLogger(__name__)
+
+
+def build_args(args: list[str] = None) -> argparse.Namespace:
+    # prog is set for pyOxidizer, due to issue: https://github.com/indygreg/PyOxidizer/issues/307
+    parser = argparse.ArgumentParser(
+        description="Extract text from screenshots.", prog="screenshot-ocr"
+    )
+    parser.add_argument(
+        "--tesseract-exe",
+        type=pathlib.Path,
+        help="path to the Tesseract executable file",
+    )
+    parser.add_argument(
+        "--tesseract-data",
+        type=pathlib.Path,
+        help="path to the Tesseract data directory",
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=pathlib.Path,
+        help="path to the folder containing the input images",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=pathlib.Path,
+        help="path to the folder that will contain processed images",
+    )
+    parser.add_argument(
+        "--no-move-images",
+        action="store_true",
+        help="don't move image files to the output directory (image files are moved by default)",
+    )
+    result = parser.parse_args(args)
+    return result
+
+
+def norm_args(args: list[str] = None):
+    parsed_args = build_args(args)
+
+    tesseract_install_dir_reg = tesseract.get_tesseract_install_dir_win_reg()
+    tesseract_install_dir_guess = tesseract.get_tesseract_install_dir_win_guess()
+
+    downloads_dir_reg = files.get_user_downloads_dir_win_guess()
+    downloads_dir_guess = files.get_user_downloads_dir_win_reg()
+
+    documents_dir_guess = files.get_user_documents_dir_win_guess()
+
+    # Tesseract exe
+    tesseract_exe = parsed_args.tesseract_exe
+    if not tesseract_exe:
+        tesseract_exe = tesseract.get_tesseract_executable_win_guess(
+            tesseract_install_dir_reg
+        )
+        if not tesseract_exe:
+            tesseract_exe = tesseract.get_tesseract_executable_win_guess(
+                tesseract_install_dir_guess
+            )
+
+    # Tesseract tessdata
+    tesseract_data = parsed_args.tesseract_data
+    if not tesseract_data:
+        tesseract_data = tesseract.get_tesseract_data_dir_win_guess(
+            tesseract_install_dir_reg
+        )
+        if not tesseract_data:
+            tesseract_data = tesseract.get_tesseract_data_dir_win_guess(
+                tesseract_install_dir_guess
+            )
+
+    # input dir
+    input_dir = parsed_args.input_dir
+    if not input_dir:
+        input_dir = downloads_dir_reg
+        if not input_dir:
+            input_dir = downloads_dir_guess
+
+    # output dir
+    output_dir = parsed_args.output_dir
+    if not output_dir:
+        output_dir = documents_dir_guess / "Tesseract"
+
+    logger.info(f"Using Tesseract executable: '{tesseract_exe}'.")
+    logger.info(f"Using Tesseract data: '{tesseract_data}'.")
+    logger.info(f"Using input directory: '{input_dir}'.")
+    logger.info(f"Using output directory: '{output_dir}'.")
+
+    return {
+        "tesseract_exe": tesseract_exe,
+        "tesseract_data": tesseract_data,
+        "input_dir": input_dir,
+        "output_dir": output_dir,
+        "no_move_images": parsed_args.no_move_images,
+    }
+
+
+def get_image_text(
+    exe_path: pathlib.Path,
+    data_dir: pathlib.Path,
+    image_dir: pathlib.Path,
+) -> typing.Tuple[pathlib.Path, str]:
+    for image_file in files.find_ff_screenshot_files(image_dir):
+        output_text = tesseract.run_tesseract(exe_path, data_dir, image_file)
+        yield image_file, output_text
+
+
+def run_program(args: list[str] = None) -> None:
+    logger.info("Starting Screenshot OCR...")
+
+    # get the arguments
+    normalised_arguments = norm_args(args)
+    tesseract_exe = normalised_arguments["tesseract_exe"]
+    tesseract_data = normalised_arguments["tesseract_data"]
+    input_dir = normalised_arguments["input_dir"]
+    output_dir = normalised_arguments["output_dir"]
+    move_images = not normalised_arguments["no_move_images"]
+
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    count = 0
+
+    # find the image files and extract the text from each
+    for image_file, output_text in get_image_text(
+        tesseract_exe, tesseract_data, input_dir
+    ):
+        if move_images:
+            # move the image file to the output dir
+            shutil.move(image_file, output_dir / image_file.name)
+
+        # create a text file with the same name as the image file that contains the extracted text
+        (output_dir / image_file.stem).with_suffix(".txt").write_text(output_text)
+
+        # print the image file name and extracted text to stdout
+        logger.info(f"{image_file.name}: {output_text}")
+
+        count += 1
+
+    logger.info(f"Found and processed {count} image file(s).")
+    logger.info("...finished.")
diff --git a/screenshot_ocr/files.py b/screenshot_ocr/files.py
@@ -0,0 +1,90 @@
+import logging
+import pathlib
+import sys
+
+from screenshot_ocr import utils
+
+logger = logging.getLogger(__name__)
+
+
+def get_user_downloads_dir_win_guess():
+    if sys.platform != "win32":
+        logger.debug("Cannot use Windows default path on non-Windows platform.")
+        return None
+
+    import os
+
+    env_var = os.environ.get("USERPROFILE")
+    if not env_var or not env_var.strip():
+        logger.debug("The Windows current user profile path %USERPROFILE% is not set.")
+        return None
+
+    return utils.guess_path(pathlib.Path(env_var), "Downloads", "user downloads")
+
+
+def get_user_downloads_dir_win_reg():
+    if sys.platform != "win32":
+        logger.debug("Cannot use Windows registry on non-Windows platform.")
+        return None
+
+    import winreg
+
+    tree_root = winreg.HKEY_CURRENT_USER
+    tree_leaf = winreg.OpenKeyEx(
+        tree_root,
+        r"SOFTWARE\\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders\\",
+    )
+    key_value, key_type = winreg.QueryValueEx(
+        tree_leaf, "{374DE290-123F-4565-9164-39C4925E467B}"
+    )
+    if tree_leaf:
+        winreg.CloseKey(tree_leaf)
+
+    if key_value and key_type == winreg.REG_SZ:
+        logger.debug(
+            f"Found user downloads directory from Windows registry: '{key_value}'."
+        )
+        return pathlib.Path(key_value)
+
+    logger.debug("Could not find user downloads directory in Windows registry.")
+    return None
+
+
+def get_user_documents_dir_win_guess():
+    if sys.platform != "win32":
+        logger.debug("Cannot use Windows default path on non-Windows platform.")
+        return None
+
+    import os
+
+    env_var = os.environ.get("USERPROFILE")
+    if not env_var or not env_var.strip():
+        logger.debug("The Windows current user profile path %USERPROFILE% is not set.")
+        return None
+
+    return utils.guess_path(pathlib.Path(env_var), "Documents", "user documents")
+
+
+def find_ff_screenshot_files(image_dir: pathlib.Path):
+    """Yield the FireFox screenshot files."""
+    logger.info(
+        f"Looking for files in '{image_dir}' "
+        "that match the pattern 'Screenshot [date] Facebook.png'."
+    )
+    for file_path in image_dir.iterdir():
+        if not file_path.is_file():
+            continue
+        if file_path.suffix != ".png":
+            continue
+        if not file_path.stem.startswith("Screenshot "):
+            continue
+        if not file_path.stem.endswith("Facebook"):
+            continue
+
+        yield file_path
+
+
+def arrange_image_file_text(
+    input_file: pathlib.Path, output_dir: pathlib.Path, image_text: str
+) -> None:
+    """Placeholder that is not implemented yet: it does nothing."""
+    pass
diff --git a/screenshot_ocr/main.py b/screenshot_ocr/main.py
@@ -0,0 +1,16 @@
+import logging
+
+
+def run():
+    logging.basicConfig(
+        format="%(asctime)s [%(levelname)-8s] %(message)s",
+        level=logging.DEBUG,
+    )
+
+    from screenshot_ocr import cli
+
+    cli.run_program()
+
+
+if __name__ == "__main__":
+    run()
-Original file line number
+Diff line change
@@ Expand Up / @@ -127,3 +127,6 @@ dmypy.json @@
     # Pyre type checker
     .pyre/
+    # idea
+    .idea/
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1,3 @@
		# screenshot-ocr

		Extract text from screenshots.