diff --git a/.cruft.json b/.cruft.json new file mode 100644 index 0000000..4b2fcbc --- /dev/null +++ b/.cruft.json @@ -0,0 +1,23 @@ +{ + "template": "https://github.com/CybercentreCanada/assemblyline-service-template.git", + "commit": "2f961bb22a68cd997efef36a7f41bbbe19b3dcf8", + "checkout": null, + "context": { + "cookiecutter": { + "service_name": "document-preview", + "__svc_name": "document-preview", + "__repository": "assemblyline-service-document-preview", + "__pkg_name": "document_preview", + "__class_name": "DocumentPreview", + "short_description": "This Assemblyline service renders documents for preview and performs OCR analysis for malicious content.", + "short_description_fr": "Ce service d'Assemblyline exécute le rendement des documents pour prévisualisation et effectue une analyse OCR pour détecter les contenus malveillants.", + "stage": "CORE", + "category": "Static Analysis", + "org_name_full": "CybercentreCanada", + "org_name_short": "cccs", + "license": "mit", + "_template": "https://github.com/CybercentreCanada/assemblyline-service-template.git" + } + }, + "directory": null +} diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..5f78d7b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,32 @@ +Dockerfile +.idea +.git +.gitignore +.vscode +.dockerignore + +pipelines +venv +.venv +env +.env +test +tests +examples +docs + +build +dist +**/__pycache__ +**/*.pyc + +pip-log.txt +pip-delete-this-directory.txt +.tox +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.log diff --git a/.gitignore b/.gitignore index 0d20b64..78f6696 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,216 @@ -*.pyc +# Created by https://www.toptal.com/developers/gitignore/api/python,vim,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,vim,visualstudiocode + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / 
packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,vim,visualstudiocode diff --git a/.vscode/launch.json b/.vscode/launch.json index f505dfb..b667e5f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,7 +13,7 @@ "args": [ "-d", "document_preview.document_preview.DocumentPreview", - "/path/to/sample" + "${file}" ], "justMyCode": false, }, diff --git a/.vscode/settings.json b/.vscode/settings.json index 1d6ab98..54dc624 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,6 @@ { "editor.codeActionsOnSave": { - "source.organizeImports": true, + "source.organizeImports": "explicit" }, "editor.formatOnSave": true, "editor.rulers": [ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b9785be --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,69 @@ +# Assemblyline contributing guide + +This guide covers the basics of how to contribute to the Assemblyline project. + +Python code should follow the PEP8 guidelines defined here: +[PEP8 Guidelines](https://www.python.org/dev/peps/pep-0008/). 
+ +## Tell us what you want to build/fix + +Before you start coding anything you should connect with the Assemblyline community via the +[Assemblyline Discord server](https://discord.gg/GUAy9wErNu) and/or the +[central Assemblyline GitHub project](https://github.com/CybercentreCanada/assemblyline/issues) to make sure no one +else is working on the same thing and that whatever you are going to build still fits with the vision of the system. + +## Git workflow + +- Clone the repo to your own account +- Checkout and pull the latest commits from the master branch +- Make a branch +- Work on your modifications and make sure your changes work as expected +- When you're satisfied with your changes, create a pull request to the Assemblyline repo + +#### Transfer your service repo + +If you've worked on a new service that you want to be included in the default service selection you'll have to transfer +the associated repo into our control. + +#### You are not allowed to merge: + +Even if you try to merge in your pull request, you will be denied. Only a few people in our team are allowed to merge +code into our repositories. + +We check for new pull requests every day and will merge them in once they have been approved by someone in our team. + +# Guide de contribution d'Assemblyline + +Ce guide couvre les bases de la façon de contribuer au projet Assemblyline. + +Le code Python doit suivre les directives PEP8 définies ici: +[Directives PEP8](https://www.python.org/dev/peps/pep-0008/). + +## Dites-nous ce que vous voulez construire / réparer + +Avant de commencer à coder quoi que ce soit, vous devriez vous connecter à la communauté Assemblyline via le +[Serveur Discord Assemblyline](https://discord.gg/GUAy9wErNu) et/ou le +[projet GitHub central Assemblyline](https://github.com/CybercentreCanada/assemblyline/issues) pour vous assurer que +personne d'autre ne travaille sur la même chose et que tout ce que vous allez construire correspond toujours à la vision +du système. 
+ +## Flux de travail avec Git + +- Clonez le référentiel sur votre propre compte +- Changez de branche pour la branche principale et synchronisez-la avec le serveur de référence +- Créez une nouvelle branche +- Travaillez sur ce que vous souhaitez et assurez-vous que vos modifications fonctionnent comme prévu +- Lorsque vous êtes satisfait de vos modifications, créez une demande de fusion sur le référentiel d'Assemblyline + +#### Transférer votre référentiel de service + +Si vous avez travaillé sur un nouveau service que vous souhaitez inclure dans la sélection de service par défaut, vous +devrez transférer le référentiel associé sous notre contrôle. + +#### Vous n'êtes pas autorisé à compléter une fusion: + +Même si vous tentez de compléter une demande de fusion, vous serez refusé. Seules quelques personnes de notre équipe +sont autorisées à fusionner dans nos référentiels. + +Nous vérifions les nouvelles demandes de fusion tous les jours et les fusionnerons une fois qu'elles auront été approuvées +par quelqu'un de notre équipe. 
diff --git a/Dockerfile b/Dockerfile index a65d4c6..1cd0efc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,43 +1,55 @@ -ARG branch=latest -FROM cccs/assemblyline-v4-service-base:$branch - -ENV SERVICE_PATH document_preview.document_preview.DocumentPreview - -USER root - -RUN apt-get update && apt-get install -y wget libreoffice unzip && apt-get install --no-install-recommends -y calibre - -RUN mkdir -p /usr/share/man/man1mkdir -p /usr/share/man/man1 -RUN apt-get install -y tesseract-ocr libemail-outlook-message-perl libgdiplus unzip -RUN apt-get install -y poppler-utils wkhtmltopdf -RUN pip install Pillow==9.5.0 natsort imgkit compoundfiles compressed_rtf pytesseract selenium unoconv multidecoder XlsxWriter pandas - -WORKDIR /tmp - -# Find out what is the latest version of the chrome-for-testing/chromedriver available -RUN VERS=$(wget -q -O - https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_STABLE) && \ - # Download + Install google-chrome with the version matching the latest chromedriver - wget -O ./google-chrome-stable_amd64.deb https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_$VERS-1_amd64.deb && \ - apt install -y ./google-chrome-stable_amd64.deb && \ - # Download + unzip the latest chromedriver - wget -O ./chromedriver-linux64.zip https://storage.googleapis.com/chrome-for-testing-public/$VERS/linux64/chromedriver-linux64.zip && \ - unzip ./chromedriver-linux64.zip chromedriver-linux64/chromedriver && \ - rm -f ./google-chrome-stable_current_amd64.deb ./chromedriver-linux64.zip && \ - mv ./chromedriver-linux64/chromedriver /usr/bin/chromedriver && \ - # Cleanup - rm -rf /tmp/* - -# Switch to assemblyline user -USER assemblyline - -# Copy DocPreview service code -WORKDIR /opt/al_service -COPY . . 
- -ARG version=4.0.0.dev1 -USER root -RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml -# Add uno package to PYTHONPATH -ENV PYTHONPATH $PYTHONPATH:/usr/lib/python3/dist-packages/ - -USER assemblyline +ARG branch=latest +FROM cccs/assemblyline-v4-service-base:$branch + +# Python path to the service class from your service directory +ENV SERVICE_PATH document_preview.document_preview.DocumentPreview + +# Install apt dependencies +USER root + +COPY pkglist.txt /tmp/setup/ +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends \ + $(grep -vE "^\s*(#|$)" /tmp/setup/pkglist.txt | tr "\n" " ") && \ + rm -f /tmp/setup/pkglist.txt + +WORKDIR /tmp + +# Find out what is the latest version of the chrome-for-testing/chromedriver available +RUN VERS=$(wget -q -O - https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_STABLE) && \ + # Download + Install google-chrome with the version matching the latest chromedriver + wget -O ./google-chrome-stable_amd64.deb https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_$VERS-1_amd64.deb && \ + apt install -y ./google-chrome-stable_amd64.deb && \ + # Download + unzip the latest chromedriver + wget -O ./chromedriver-linux64.zip https://storage.googleapis.com/chrome-for-testing-public/$VERS/linux64/chromedriver-linux64.zip && \ + unzip ./chromedriver-linux64.zip chromedriver-linux64/chromedriver && \ + rm -f ./google-chrome-stable_current_amd64.deb ./chromedriver-linux64.zip && \ + mv ./chromedriver-linux64/chromedriver /usr/bin/chromedriver && \ + # Cleanup + rm -rf /tmp/* + +RUN rm -rf /var/lib/apt/lists/* + +# Install python dependencies +USER assemblyline +COPY requirements.txt requirements.txt +RUN pip install \ + --no-cache-dir \ + --user \ + --requirement requirements.txt && \ + rm -rf ~/.cache/pip + +# Copy service code +WORKDIR /opt/al_service +COPY . . 
+ +# Patch version in manifest +ARG version=1.0.0.dev1 +USER root +RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml +# Add uno package to PYTHONPATH +ENV PYTHONPATH $PYTHONPATH:/usr/lib/python3/dist-packages/ + +# Switch to assemblyline user +USER assemblyline diff --git a/LICENSE b/LICENSE index 99698e2..0de2b63 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,11 @@ MIT License -Copyright (c) 2021 x1mus +Copyright (c) 2024 Crown Copyright, Government of Canada +(Canadian Centre for Cyber Security / Communications Security Establishment) + +Copyright title to all 3rd party software distributed with Assemblyline (AL) +is held by the respective copyright holders as noted in those files. Users +are asked to read the 3rd Party Licenses referenced with those assets. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,3 +24,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d584767 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +ifndef VERSION +$(error VERSION is undefined) +endif + +TAG?=latest +ORG?=cccs + +ifneq ($(ORG)x, x) +ORG:=$(ORG)/ +endif +ifneq ($(REGISTRY)x, x) +ORG:=$(REGISTRY)/ +endif + +.PHONY: default +default: build + +.PHONY: build +build: + docker build \ + --pull \ + --build-arg version=$(VERSION) \ + --build-arg branch=stable \ + -t $(REGISTRY)$(ORG)assemblyline-service-document-preview:$(TAG)\ + -f ./Dockerfile \ + . 
diff --git a/README.md b/README.md index c236966..33d4a91 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,104 @@ -# Document preview service -This repository is a self-developed Assemblyline service based on a [FAME's module](https://github.com/certsocietegenerale/fame_modules/tree/master/processing/document_preview). -It was created by [x1mus](https://github.com/x1mus) with support from [Sorakurai](https://github.com/Sorakurai) and [reynas](https://github.com/reynas) at [NVISO](https://github.com/NVISOsecurity). - -This also contains modified source code from the following repositories: -- [XME's emlrender](https://github.com/xme/emlrender) -- [JoshData's convert-outlook-msg-file](https://github.com/JoshData/convert-outlook-msg-file) - -## OCR Configuration -In this service, you're allowed to override the default OCR terms from the [service base](https://github.com/CybercentreCanada/assemblyline-v4-service/blob/master/assemblyline_v4_service/common/ocr.py) using `ocr` key in the `config` block of the service manifest. - -### Simple Term Override (Legacy) -Let's say, I want to use a custom set of terms for `ransomware` detection. Then I can set the following: - -```yaml -config: - ocr: - ransomware: ['bad1', 'bad2', ...] -``` - -This will cause the service to **only** use the terms I've specified when looking for `ransomware` terms. This is still subject to the hit threshold defined in the service base. - -### Advanced Term Override -Let's say, I want to use a custom set of terms for `ransomware` detection and I want to set the hit threshold to `1` instead of `2` (default). Then I can set the following: - -```yaml -config: - ocr: - ransomware: - terms: ['bad1', 'bad2', ...] - threshold: 1 -``` - -This will cause the service to **only** use the terms I've specified when looking for `ransomware` terms and is subject to the hit threshold I've defined. 
- -### Term Inclusion/Exclusion -Let's say, I want to add/remove a set of terms from the default set for `ransomware` detection. Then I can set the following: - -```yaml -config: - ocr: - ransomware: - include: ['bad1', 'bad2', ...] - exclude: ['bank account'] -``` - -This will cause the service to add the terms listed in `include` and remove the terms in `exclude` when looking for `ransomware` terms in OCR detection with the default set. +[![Discord](https://img.shields.io/badge/chat-on%20discord-7289da.svg?sanitize=true)](https://discord.gg/GUAy9wErNu) +[![](https://img.shields.io/discord/908084610158714900)](https://discord.gg/GUAy9wErNu) +[![Static Badge](https://img.shields.io/badge/github-assemblyline-blue?logo=github)](https://github.com/CybercentreCanada/assemblyline) +[![Static Badge](https://img.shields.io/badge/github-assemblyline\_service\_document\_preview-blue?logo=github)](https://github.com/CybercentreCanada/assemblyline-service-document-preview) +[![GitHub Issues or Pull Requests by label](https://img.shields.io/github/issues/CybercentreCanada/assemblyline/service-document-preview)](https://github.com/CybercentreCanada/assemblyline/issues?q=is:issue+is:open+label:service-document-preview) +[![License](https://img.shields.io/github/license/CybercentreCanada/assemblyline-service-document-preview)](./LICENSE) +# DocumentPreview Service + +This Assemblyline service renders documents for preview and performs OCR analysis for malicious content. + +## Service Details + +### OCR +This service uses OCR for its analysis. You can find information about OCR configurations [here](https://cybercentrecanada.github.io/assemblyline4_docs/administration/service_management/#ocr-configuration). + +## Accreditation / Contributions +This Assemblyline service is based on [FAME's module](https://github.com/certsocietegenerale/fame_modules/tree/master/processing/document_preview). 
+It was originally created by [x1mus](https://github.com/x1mus) with support from [Sorakurai](https://github.com/Sorakurai) and [reynas](https://github.com/reynas) at [NVISO](https://github.com/NVISOsecurity). + +This also contains modified source code from the following repositories: +- [XME's emlrender](https://github.com/xme/emlrender) +- [JoshData's convert-outlook-msg-file](https://github.com/JoshData/convert-outlook-msg-file) + +## Image variants and tags + +Assemblyline services are built from the [Assemblyline service base image](https://hub.docker.com/r/cccs/assemblyline-v4-service-base), +which is based on Debian 11 with Python 3.11. + +Assemblyline services use the following tag definitions: + +| **Tag Type** | **Description** | **Example Tag** | +| :----------: | :----------------------------------------------------------------------------------------------- | :------------------------: | +| latest | The most recent build (can be unstable). | `latest` | +| build_type | The type of build used. `dev` is the latest unstable build. `stable` is the latest stable build. | `stable` or `dev` | +| series | Complete build details, including version and build type: `version.buildType`. | `4.5.stable`, `4.5.1.dev3` | + +## Running this service + +This is an Assemblyline service. It is designed to run as part of the Assemblyline framework. + +If you would like to test this service locally, you can run the Docker image directly from a shell: + + docker run \ + --name DocumentPreview \ + --env SERVICE_API_HOST=http://`ip addr show docker0 | grep "inet " | awk '{print $2}' | cut -f1 -d"/"`:5003 \ + --network=host \ + cccs/assemblyline-service-document-preview + +To add this service to your Assemblyline deployment, follow this +[guide](https://cybercentrecanada.github.io/assemblyline4_docs/developer_manual/services/run_your_service/#add-the-container-to-your-deployment). 
+ +## Documentation + +General Assemblyline documentation can be found at: https://cybercentrecanada.github.io/assemblyline4_docs/ + +# Service DocumentPreview + +Ce service d'Assemblyline exécute le rendement des documents pour prévisualisation et effectue une analyse OCR pour détecter les contenus malveillants. + + +## Détails du service + +### OCR +Ce service utilise l'OCR pour son analyse. Vous pouvez trouver les détails de configurations de l'OCR [ici](https://cybercentrecanada.github.io/assemblyline4_docs/administration/service_management/#ocr-configuration). + +## Accréditation / Contributions +Ce service Assemblyline est basé sur le module [FAME](https://github.com/certsocietegenerale/fame_modules/tree/master/processing/document_preview). +Il a été créé à l'origine par [x1mus](https://github.com/x1mus) avec le soutien de [Sorakurai](https://github.com/Sorakurai) et [reynas](https://github.com/reynas) à [NVISO](https://github.com/NVISOsecurity). + +Il contient également du code source modifié provenant des dépôts suivants : +- [emlrender de XME](https://github.com/xme/emlrender) +- [convert-outlook-msg-file de JoshData](https://github.com/JoshData/convert-outlook-msg-file) + +## Variantes et étiquettes d'image + +Les services d'Assemblyline sont construits à partir de l'image de base [Assemblyline service](https://hub.docker.com/r/cccs/assemblyline-v4-service-base), +qui est basée sur Debian 11 avec Python 3.11. + +Les services d'Assemblyline utilisent les définitions d'étiquettes suivantes: + +| **Type d'étiquette** | **Description** | **Exemple d'étiquette** | +| :------------------: | :------------------------------------------------------------------------------------------------------------- | :------------------------: | +| dernière version | La version la plus récente (peut être instable). | `latest` | +| build_type | Type de construction utilisé. `dev` est la dernière version instable. `stable` est la dernière version stable. 
| `stable` ou `dev` | +| série | Détails de construction complets, comprenant la version et le type de build: `version.buildType`. | `4.5.stable`, `4.5.1.dev3` | + +## Exécution de ce service + +Ce service est spécialement optimisé pour fonctionner dans le cadre d'un déploiement d'Assemblyline. + +Si vous souhaitez tester ce service localement, vous pouvez exécuter l'image Docker directement à partir d'un terminal: + + docker run \ + --name DocumentPreview \ + --env SERVICE_API_HOST=http://`ip addr show docker0 | grep "inet " | awk '{print $2}' | cut -f1 -d"/"`:5003 \ + --network=host \ + cccs/assemblyline-service-document-preview + +Pour ajouter ce service à votre déploiement d'Assemblyline, suivez ceci +[guide](https://cybercentrecanada.github.io/assemblyline4_docs/fr/developer_manual/services/run_your_service/#add-the-container-to-your-deployment). + +## Documentation + +La documentation générale sur Assemblyline peut être consultée à l'adresse suivante: https://cybercentrecanada.github.io/assemblyline4_docs/ diff --git a/document_preview/__init__.py b/document_preview/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py index b467fb2..b06ec4a 100644 --- a/document_preview/document_preview.py +++ b/document_preview/document_preview.py @@ -52,11 +52,10 @@ def convert_from_path(fp: str, output_directory: str, first_page=1, last_page=No class DocumentPreview(ServiceBase): def __init__(self, config=None): super(DocumentPreview, self).__init__(config) - browser_options = ChromeOptions() # Set brower options depending on service configuration - browser_cfg = config.get("browser_options", {}) + browser_cfg = self.config.get("browser_options", {}) [browser_options.add_argument(arg) for arg in browser_cfg.get("arguments", [])] [browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get("capabilities", {}).items()] diff --git a/pipelines/.cruft.json 
b/pipelines/.cruft.json new file mode 100644 index 0000000..12eab21 --- /dev/null +++ b/pipelines/.cruft.json @@ -0,0 +1,21 @@ +{ + "template": "https://github.com/CybercentreCanada/assemblyline-service-pipeline-generator.git", + "commit": "eb41868283c10b1a31746ebfd29b2c0e32c26025", + "checkout": null, + "context": { + "cookiecutter": { + "__directory_name": "pipelines", + "classification": "UNCLASSIFIED", + "is_public": true, + "test_in_container": true, + "test_versions": "default", + "timeout": "default", + "__setupscript": "setup.bash", + "_extensions": [ + "local_extensions.extract_versions" + ], + "_template": "https://github.com/CybercentreCanada/assemblyline-service-pipeline-generator.git" + } + }, + "directory": null +} diff --git a/pipelines/.gitignore b/pipelines/.gitignore new file mode 100644 index 0000000..21a119d --- /dev/null +++ b/pipelines/.gitignore @@ -0,0 +1 @@ +setup.bash diff --git a/pipelines/azure-build.yaml b/pipelines/azure-build.yaml index bbbc6c0..a56c6eb 100644 --- a/pipelines/azure-build.yaml +++ b/pipelines/azure-build.yaml @@ -2,23 +2,6 @@ name: build variables: - group: unittest-samples - - name: self_location - value: "self_location" - - name: full_self_location - value: "$(Agent.BuildDirectory)/$(self_location)" - - name: samples_location - value: "samples_location" - - name: full_samples_location - value: "$(Agent.BuildDirectory)/$(samples_location)" - -resources: - repositories: - - repository: unittest-samples - type: github - name: $(unittest_samples_repository) - ref: main - endpoint: github-repo-sa - trigger: none trigger: tags: @@ -28,54 +11,26 @@ pr: none pool: vmImage: "ubuntu-20.04" -stages: - - stage: deploy - jobs: - - job: deploy - displayName: Deploy containers to dockerhub - variables: - - group: deployment-information - steps: - - task: Docker@2 - displayName: Login to docker hub - inputs: - command: login - containerRegistry: dockerhub - - task: Docker@2 - displayName: Login to chimera - inputs: - command: login 
- containerRegistry: CHIMERA-U-ACR - - checkout: self - fetchDepth: 1 - path: $(self_location) - - checkout: unittest-samples - fetchDepth: 1 - path: $(samples_location) - - script: | - export TAG=${BUILD_SOURCEBRANCH#"refs/tags/v"} - if [[ "$TAG" == *stable* ]]; then export BUILD_TYPE=stable; else export BUILD_TYPE=latest; fi - docker build --build-arg version=$TAG --build-arg branch=$BUILD_TYPE -t cccs/${BUILD_REPOSITORY_NAME##*/}:$TAG -t cccs/${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE -f ./Dockerfile . - workingDirectory: $(full_self_location) - displayName: Build containers - - script: | - [ ! -d "$(pwd)/tests" ] && echo "No tests found" && exit - export TAG=${BUILD_SOURCEBRANCH#"refs/tags/v"} - if [[ "$TAG" == *stable* ]]; then export BUILD_TYPE=stable; else export BUILD_TYPE=latest; fi - [ -f "$(pwd)/tests/requirements.txt" ] && docker run -e FULL_SELF_LOCATION=/opt/al_service -e FULL_SAMPLES_LOCATION=/opt/samples -v /usr/share/ca-certificates/mozilla:/usr/share/ca-certificates/mozilla -v $(pwd)/tests/:/opt/al_service/tests/ -v ${FULL_SAMPLES_LOCATION}:/opt/samples cccs/${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE bash -c 'pip install -U -r tests/requirements.txt; pytest -p no:cacheprovider -vv' && exit - docker run -e FULL_SELF_LOCATION=/opt/al_service -e FULL_SAMPLES_LOCATION=/opt/samples -v /usr/share/ca-certificates/mozilla:/usr/share/ca-certificates/mozilla -v $(pwd)/tests/:/opt/al_service/tests/ -v ${FULL_SAMPLES_LOCATION}:/opt/samples cccs/${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE bash -c 'pytest -p no:cacheprovider -vv' - workingDirectory: $(full_self_location) - displayName: Test containers - - script: | - export TAG=${BUILD_SOURCEBRANCH#"refs/tags/v"} - if [[ "$TAG" == *stable* ]]; then export BUILD_TYPE=stable; else export BUILD_TYPE=latest; fi - export SERIES="`expr $TAG : '\([0-9]\+\.[0-9]\+\.\)'`${BUILD_TYPE}" +resources: + repositories: + - repository: PipelineTemplates + type: github + name: CybercentreCanada/assemblyline-pipeline-templates + ref: 
refs/heads/main + endpoint: github-repo-sa + trigger: none + - repository: unittest-samples + type: github + name: $(unittest_samples_repository) + ref: main + endpoint: github-repo-sa + trigger: none - for IMAGE in "cccs/" "uchimera.azurecr.io/cccs/" - do - docker tag cccs/${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE ${IMAGE}${BUILD_REPOSITORY_NAME##*/}:$TAG - docker tag cccs/${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE ${IMAGE}${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE - docker tag cccs/${BUILD_REPOSITORY_NAME##*/}:$BUILD_TYPE ${IMAGE}${BUILD_REPOSITORY_NAME##*/}:$SERIES - docker push ${IMAGE}${BUILD_REPOSITORY_NAME##*/} --all-tags - done - displayName: Deploy to container repositories +extends: + template: stages/deploy-service.yaml@PipelineTemplates + parameters: + is_public: "true" + samples_repo: unittest-samples + labels: + classification: "UNCLASSIFIED" + \ No newline at end of file diff --git a/pipelines/azure-tests.yaml b/pipelines/azure-tests.yaml new file mode 100644 index 0000000..d984083 --- /dev/null +++ b/pipelines/azure-tests.yaml @@ -0,0 +1,32 @@ +name: tests + +variables: + - group: unittest-samples + +trigger: ["*"] +pr: ["*"] + +pool: + vmImage: "ubuntu-20.04" + +resources: + repositories: + - repository: PipelineTemplates + type: github + name: CybercentreCanada/assemblyline-pipeline-templates + ref: refs/heads/main + endpoint: github-repo-sa + trigger: none + - repository: unittest-samples + type: github + name: $(unittest_samples_repository) + ref: main + endpoint: github-repo-sa + trigger: none + +extends: + template: stages/test-service.yaml@PipelineTemplates + parameters: + samples_repo: unittest-samples + test_container: "true" + \ No newline at end of file diff --git a/pkglist.txt b/pkglist.txt new file mode 100644 index 0000000..27cc4a4 --- /dev/null +++ b/pkglist.txt @@ -0,0 +1,9 @@ +calibre +libemail-outlook-message-perl +libgdiplus +libreoffice +poppler-utils +tesseract-ocr +unzip +wget +wkhtmltopdf diff --git a/pyproject.toml 
b/pyproject.toml new file mode 100644 index 0000000..0cb7c1e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.cruft] +skip = ["pkglist.txt", "README.md", "document_preview", "tests"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..01a683b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +assemblyline +assemblyline-v4-service +compoundfiles +compressed_rtf +imgkit +multidecoder +natsort +pandas +Pillow==9.5.0 +pytesseract +selenium +unoconv +XlsxWriter diff --git a/tests/gentests.py b/tests/gentests.py new file mode 100755 index 0000000..7ae6dde --- /dev/null +++ b/tests/gentests.py @@ -0,0 +1,30 @@ +#!/bin/env python +import os + +from assemblyline.common.importing import load_module_by_path +from assemblyline_service_utilities.testing.helper import TestHelper + +cwd = os.getcwd() +# Force manifest location +os.environ["SERVICE_MANIFEST_PATH"] = os.path.join(cwd, "service_manifest.yml") + +# Setup folder locations +RESULTS_FOLDER = os.path.join(cwd, "tests", "results") +SAMPLES_FOLDER = os.path.join(cwd, "tests", "samples") + +# Find which module we're working on +module = os.environ.get("SERVICE_PATH") +if not module: + for line in open("Dockerfile", "r").readlines(): + if line.startswith("ENV SERVICE_PATH"): + module = line[17:].strip() + break + +# Initialize test helper +service_class = load_module_by_path(module, cwd) +if os.path.exists(SAMPLES_FOLDER): + th = TestHelper(service_class, RESULTS_FOLDER, SAMPLES_FOLDER) +else: + th = TestHelper(service_class, RESULTS_FOLDER) + +th.regenerate_results(save_files=False) diff --git a/tests/gentests.sh b/tests/gentests.sh new file mode 100755 index 0000000..58a159f --- /dev/null +++ b/tests/gentests.sh @@ -0,0 +1,22 @@ +#!/bin/bash +docker build \ + --pull \ + --build-arg branch=stable \ + -t ${PWD##*/}:gentests \ + -f ./Dockerfile \ + . 
+
+if [[ -n "$FULL_SAMPLES_LOCATION" ]]; then
+    MOUNT_SAMPLES="-v ${FULL_SAMPLES_LOCATION}:/opt/samples"
+    ENV_SAMPLES="-e FULL_SAMPLES_LOCATION=/opt/samples"
+fi
+docker run \
+    -t \
+    --rm \
+    -e FULL_SELF_LOCATION=/opt/al_service \
+    $ENV_SAMPLES \
+    -v /usr/share/ca-certificates/mozilla:/usr/share/ca-certificates/mozilla \
+    -v $(pwd)/tests/:/opt/al_service/tests/ \
+    $MOUNT_SAMPLES \
+    ${PWD##*/}:gentests \
+    bash -c "pip install -U -r tests/requirements.txt; python /opt/al_service/tests/gentests.py"
diff --git a/tests/pytest.sh b/tests/pytest.sh
new file mode 100755
index 0000000..52fc232
--- /dev/null
+++ b/tests/pytest.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Build the service image, then run the pytest suite inside a container.
+docker build \
+    --pull \
+    --build-arg branch=stable \
+    -t ${PWD##*/}:pytest \
+    -f ./Dockerfile \
+    .
+
+# Only mount a samples directory when FULL_SAMPLES_LOCATION is set.
+# bash needs a space before "]]" and no spaces around "=" in an assignment.
+if [[ -n "$FULL_SAMPLES_LOCATION" ]]; then
+    MOUNT_SAMPLES="-v ${FULL_SAMPLES_LOCATION}:/opt/samples"
+    ENV_SAMPLES="-e FULL_SAMPLES_LOCATION=/opt/samples"
+fi
+# $ENV_SAMPLES / $MOUNT_SAMPLES stay unquoted on purpose: they must split into separate arguments and vanish when unset.
+docker run \
+    -t \
+    --rm \
+    -e FULL_SELF_LOCATION=/opt/al_service \
+    $ENV_SAMPLES \
+    -v /usr/share/ca-certificates/mozilla:/usr/share/ca-certificates/mozilla \
+    -v $(pwd)/tests/:/opt/al_service/tests/ \
+    $MOUNT_SAMPLES \
+    ${PWD##*/}:pytest \
+    bash -c "pip install -U -r tests/requirements.txt; pytest -p no:cacheprovider --durations=10 -rsx -vv -x"
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 0000000..6e3d947
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,3 @@
+assemblyline
+assemblyline-service-utilities
+pytest
diff --git a/tests/results/092267791d9573b097f90d19c6737832f6b0498c98839dd2f4a9e5287a36c78a/result.json b/tests/results/092267791d9573b097f90d19c6737832f6b0498c98839dd2f4a9e5287a36c78a/result.json
new file mode 100644
index 0000000..a1e9bc4
--- /dev/null
+++ b/tests/results/092267791d9573b097f90d19c6737832f6b0498c98839dd2f4a9e5287a36c78a/result.json
@@ -0,0 +1,52 @@
+{
+  "extra": {
+    "drop_file": false,
+    "score": 0,
+    "sections": [
+      {
+        "auto_collapse": false,
+        "body": [
+          {
+            "img": {
+              "description": "Here's the preview for page 0",
+              "name": "page_000.png",
+              "sha256": "0259b9068dffcf2a49353526861d1fb6c4153e0d1467f7e7ab5dce935ee3d0be"
+            },
+            "thumb": {
+              "description": "Here's the preview for page 0 (thumbnail)",
+              "name": "page_000.png.thumb",
+              "sha256": "095ba259701d8d1dd04aafd51bfac29c46e3da6b544d561d146538bff2788f1d"
+            }
+          }
+        ],
+        "body_config": {},
+        "body_format": "IMAGE",
+        "classification": "TLP:C",
+        "depth": 0,
+        "heuristic": null,
+        "promote_to": "SCREENSHOT",
+        "tags": {},
+        "title_text": "Preview Image(s)",
+        "zeroize_on_tag_safe": false
+      }
+    ]
+  },
+  "files": {
+    "extracted": [],
+    "supplementary": [
+      {
+        "name": "page_000.png",
+        "sha256": "0259b9068dffcf2a49353526861d1fb6c4153e0d1467f7e7ab5dce935ee3d0be"
+      },
+      {
+        "name": "page_000.png.thumb",
+        "sha256": "095ba259701d8d1dd04aafd51bfac29c46e3da6b544d561d146538bff2788f1d"
+      }
+    ]
+  },
+  "results": {
+    "heuristics": [],
+    "tags": {},
+    "temp_submission_data": {}
+  }
+}
\ No newline at end of file
diff --git a/tests/test_document_preview.py b/tests/test_document_preview.py
new file mode 100644
index 0000000..a64562e
--- /dev/null
+++ b/tests/test_document_preview.py
@@ -0,0 +1,31 @@
+"""Compare the service's output against the stored result of every sample."""
+import os
+import time
+
+import pytest
+from assemblyline.common.importing import load_module_by_path
+from assemblyline_service_utilities.testing.helper import TestHelper
+
+# Everything below is resolved relative to this tests/ directory
+TESTS_DIR = os.path.dirname(__file__)
+SERVICE_ROOT = os.path.join(TESTS_DIR, "..")
+
+# The manifest sits one level above the tests; point the service at it explicitly
+os.environ["SERVICE_MANIFEST_PATH"] = os.path.join(SERVICE_ROOT, "service_manifest.yml")
+
+# Stored results and input samples
+RESULTS_FOLDER = os.path.join(TESTS_DIR, "results")
+SAMPLES_FOLDER = os.path.join(TESTS_DIR, "samples")
+
+# Load the service class and hand it to the test helper
+service_class = load_module_by_path("document_preview.document_preview.DocumentPreview", SERVICE_ROOT)
+th = TestHelper(service_class, RESULTS_FOLDER, SAMPLES_FOLDER)
+
+
+@pytest.mark.parametrize("sample", th.result_list())
+def test_sample(sample):
+    """Run the comparison for one sample and print how long it took."""
+    began = time.time()
+    th.run_test_comparison(sample)
+    elapsed = round(time.time() - began)
+    print(f"Time elapsed for {sample}: {elapsed}s")