# Build target: the standalone `screenshot-ocr` executable. It embeds the
# `screenshot_ocr` package found in this directory and runs the
# `screenshot_ocr.main` module as `__main__` when it starts.
def make_exe():
    dist = default_python_distribution()
    policy = dist.make_python_packaging_policy()
    python_config = dist.make_python_interpreter_config()

    # Run a Python module as __main__ when the interpreter starts.
    python_config.run_module = "screenshot_ocr.main"

    # Produce a PythonExecutable from a Python distribution, embedded
    # resources, and other options. The returned object represents the
    # standalone executable that will be built.
    exe = dist.to_python_executable(
        name="screenshot-ocr",

        # If no argument passed, the default `PythonPackagingPolicy` for the
        # distribution is used.
        packaging_policy=policy,

        # If no argument passed, the default `PythonInterpreterConfig` is used.
        config=python_config,
    )

    # Read Python files from the current directory and add them to our
    # embedded context, taking just the resources belonging to the
    # `screenshot_ocr` Python package.
    exe.add_python_resources(exe.read_package_root(
        path=".",
        packages=["screenshot_ocr"],
    ))

    # Return our `PythonExecutable` instance so it can be built and
    # referenced by other consumers of this target.
    return exe
# Placeholder for automatic code signing.
#
# NOTE: no code signer is registered anywhere in this function, so it
# currently has no effect whether or not ENABLE_CODE_SIGNING is set.
def register_code_signers():
    # Code signing is opt-in: run `pyoxidizer build --var ENABLE_CODE_SIGNING 1`
    # to get past this guard. Without the variable the function returns here.
    if not VARS.get("ENABLE_CODE_SIGNING"):
        return
def norm_args(args: list[str] = None):
    """Parse the command line and fill in missing paths with Windows defaults.

    A value given on the command line always wins. Anything left unset is
    looked up, preferring the Windows registry over a guessed default
    location. Lookups are only performed for settings that are still missing.

    Args:
        args: Argument strings to parse. ``None`` is passed straight through
            to ``build_args``.

    Returns:
        A dict with the keys ``"tesseract_exe"``, ``"tesseract_data"``,
        ``"input_dir"``, ``"output_dir"`` and ``"no_move_images"``. The first
        three are ``None`` when no value was given and none could be found.

    Raises:
        ValueError: If ``--output-dir`` was not given and the user documents
            directory could not be determined.
    """
    parsed_args = build_args(args)

    # Tesseract executable and tessdata directory.
    tesseract_exe = parsed_args.tesseract_exe
    tesseract_data = parsed_args.tesseract_data
    if not tesseract_exe or not tesseract_data:
        # Candidate install directories in order of preference:
        # registry entry first, default install location second.
        install_dirs = (
            tesseract.get_tesseract_install_dir_win_reg(),
            tesseract.get_tesseract_install_dir_win_guess(),
        )
        for install_dir in install_dirs:
            if not tesseract_exe:
                tesseract_exe = tesseract.get_tesseract_executable_win_guess(
                    install_dir
                )
            if not tesseract_data:
                tesseract_data = tesseract.get_tesseract_data_dir_win_guess(
                    install_dir
                )

    # Input dir: explicit value, then the registry, then the guessed default.
    # `or` short-circuits, so later lookups only run when still needed.
    input_dir = (
        parsed_args.input_dir
        or files.get_user_downloads_dir_win_reg()
        or files.get_user_downloads_dir_win_guess()
    )

    # Output dir: explicit value, else a "Tesseract" folder in the user's
    # documents directory.
    output_dir = parsed_args.output_dir
    if not output_dir:
        documents_dir = files.get_user_documents_dir_win_guess()
        if not documents_dir:
            # Joining None with "Tesseract" would raise an opaque TypeError;
            # tell the user how to fix it instead.
            raise ValueError(
                "Could not determine the user documents directory; "
                "please specify --output-dir."
            )
        output_dir = documents_dir / "Tesseract"

    logger.info(f"Using Tesseract executable: '{tesseract_exe}'.")
    logger.info(f"Using Tesseract data: '{tesseract_data}'.")
    logger.info(f"Using input directory: '{input_dir}'.")
    logger.info(f"Using output directory: '{output_dir}'.")

    return {
        "tesseract_exe": tesseract_exe,
        "tesseract_data": tesseract_data,
        "input_dir": input_dir,
        "output_dir": output_dir,
        "no_move_images": parsed_args.no_move_images,
    }
def get_user_downloads_dir_win_reg():
    """Return the user's Downloads folder as recorded in the Windows registry.

    Reads the Downloads known-folder value from the current user's
    ``Explorer\\Shell Folders`` key.

    Returns:
        The folder as a ``pathlib.Path``, or ``None`` when not running on
        Windows or when the registry holds no usable string value.
    """
    if sys.platform != "win32":
        logger.debug("Cannot use Windows registry on non-Windows platform.")
        return None

    import winreg

    key_value = None
    key_type = None
    try:
        # The handle is a context manager, so it is closed even when the
        # value lookup raises.
        with winreg.OpenKeyEx(
            winreg.HKEY_CURRENT_USER,
            # Single separators: in a raw string "\\" is two backslashes.
            r"SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders",
        ) as tree_leaf:
            key_value, key_type = winreg.QueryValueEx(
                tree_leaf, "{374DE290-123F-4565-9164-39C4925E467B}"
            )
    except OSError:
        # A missing key or value raises OSError (usually FileNotFoundError).
        # Treat it as "not found" so the caller can fall back to a guess.
        pass

    if key_value and key_type == winreg.REG_SZ:
        logger.debug(
            f"Found user downloads directory from Windows registry: '{key_value}'."
        )
        return pathlib.Path(key_value)

    logger.debug("Could not find user downloads directory in Windows registry.")
    return None
def get_tesseract_install_dir_win_reg() -> typing.Optional[pathlib.Path]:
    """Return the Tesseract install directory recorded in the Windows registry.

    Reads the ``InstallDir`` value of ``HKEY_LOCAL_MACHINE\\SOFTWARE\\Tesseract-OCR``.

    Returns:
        The directory as a ``pathlib.Path``, or ``None`` when not running on
        Windows or when the registry holds no usable string value.
    """
    if sys.platform != "win32":
        logger.debug("Cannot use Windows registry on non-Windows platform.")
        return None

    import winreg

    key_value = None
    key_type = None
    try:
        # The handle is a context manager, so it is closed even when the
        # value lookup raises.
        with winreg.OpenKeyEx(
            winreg.HKEY_LOCAL_MACHINE,
            # Single separators: in a raw string "\\" is two backslashes.
            r"SOFTWARE\Tesseract-OCR",
        ) as tree_leaf:
            key_value, key_type = winreg.QueryValueEx(tree_leaf, "InstallDir")
    except OSError:
        # A missing key or value raises OSError (usually FileNotFoundError).
        # Treat it as "not found" so the caller can fall back to a guess.
        pass

    if key_value and key_type == winreg.REG_SZ:
        logger.debug(
            f"Found Tesseract install directory from Windows registry: '{key_value}'."
        )
        return pathlib.Path(key_value)

    logger.debug("Could not find Tesseract install directory in Windows registry.")
    return None