diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..024f98b5 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,5 @@ +[run] +omit = */test_*,*/test,*/test/*,build/*,src/customer/*,/private/*,/tmp/* + +[report] +ignore_errors = True diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..2bb50d25 --- /dev/null +++ b/.flake8 @@ -0,0 +1,9 @@ +[flake8] +ignore = E501,W503,E203,E266,E265,F811 +# E501 line too long +# W503 clang-formatter can produce line break before binary operator +# E203 False positive "whitespaces before :" especially when slicing list, arrays etc. +# E266 Too many leading '#' for block comment +# E265 Block comment should start with '# ' +# F811 A module has been imported twice, but seen when importing functions for fixtures in pytest files. +exclude = **/*_pb2.py diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 00000000..437c010e --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,3 @@ +[settings] +profile=black +known_first_party=qai_hub_models diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..e200a3eb --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,57 @@ +# exclude: +# * third_party folders +# * CloudFormation templates don't pass the yaml-check hook (a known issue) + +exclude: | + (?x)( + /build/ + ) + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: check-yaml + args: [--allow-multiple-documents] + - id: trailing-whitespace + exclude: '\.diff$' + - id: check-added-large-files + args: ['--maxkb=1024'] + - id: check-merge-conflict + - id: detect-aws-credentials + args: [--allow-missing-credentials] + - id: end-of-file-fixer + exclude: | + (?x)( + \.diff$ + ) + - id: fix-byte-order-marker +- repo: https://github.com/rhysd/actionlint + rev: v1.6.26 + hooks: + - id: actionlint +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.9.0.6 + hooks: + - id: shellcheck +- repo: https://github.com/pycqa/isort 
+ rev: 5.12.0 + hooks: + - id: isort + args: ["--filter-files"] +- repo: https://github.com/psf/black + rev: 22.1.0 + hooks: + - id: black + additional_dependencies: ['click==8.0.4'] +- repo: https://github.com/pycqa/flake8 + rev: 6.1.0 + hooks: + - id: flake8 +- repo: local + hooks: + - id: mypy + name: mypy + files: \.(py|pyi|ipynb|proto|fbs)$ + entry: scripts/util/run_mypy.sh + language: system diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 00000000..58af7b11 --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,33 @@ +## How to update these rules +## +## From repo root, `pre-commit run -a shellcheck` to run shellcheck across all files in the repo. By default, the commit hook +## will only run on your patch. However, CI runs it across all files, so if you're going to touch this file, I +## recommend forcing it to run. +## +## To run on and fix a specific file, run: +## shellcheck -x -f diff {filename} | apply +## + +external-sources=true + +## Issue 7533: [shellcheck] Explore miscellaneous checks +disable=SC2003 # expr is antiquated. Consider rewriting this using `$((..))`, `${}` or `[[ ]]`. +disable=SC2005 # Useless `echo`? Instead of `echo $(cmd)`, just use `cmd` +disable=SC2012 # Use `find` instead of `ls` to better handle non-alphanumeric filenames. +disable=SC2116 # Useless echo? Instead of `cmd $(echo foo)`, just use `cmd foo`. +disable=SC2155 # Declare and assign separately to avoid masking return values. +disable=SC2166 # Prefer `[ p ] && [ q ]` as `[ p -a q ]` is not well defined. + + +## things we probably don't care about ever fixing. + +disable=SC1083 # This `{`/`}` is literal. Check if `;` is missing or quote the expression. + +disable=SC2001 # See if you can use `${variable//search/replace}` instead. +disable=SC2064 # Use single quotes, otherwise this expands now rather than when signalled. +disable=SC2129 # Consider using `{ cmd1; cmd2; } >> file` instead of individual redirects. 
+disable=SC2143 # Use `grep -q` instead of comparing output with `[ -n .. ]`. +disable=SC2148 # Tips depend on target shell and yours is unknown. Add a shebang. +disable=SC2162 # `read` without `-r` will mangle backslashes. +disable=SC2164 # Use `cd ... || exit` in case `cd` fails. +disable=SC2181 # Check exit code directly with e.g. `if mycmd;`, not indirectly with `$?`. diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..ab9c99ed --- /dev/null +++ b/LICENSE @@ -0,0 +1,11 @@ +Copyright 2024 Qualcomm® Technologies, Inc. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..bac83c81 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,78 @@ +[mypy] +# https://mypy.readthedocs.io/en/stable/config_file.html#import-discovery +exclude = "qai_hub_models/models" + +[mypy-huggingface_hub.*] +ignore_missing_imports = True + +[mypy-onnx.*] +ignore_missing_imports = True + +[mypy-onnxsim.*] +ignore_missing_imports = True + +[mypy-onnxruntime.*] +ignore_missing_imports = True + +[mypy-pytimeparse.*] +ignore_missing_imports = True + +[mypy-skimage.*] +ignore_missing_imports = True + +[mypy-setuptools.*] +ignore_missing_imports = True + +[mypy-tensorflow.*] +ignore_missing_imports = True + +[mypy-torchvision.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True + +[mypy-tqdm.*] +ignore_missing_imports = True + +[mypy-tap.*] +ignore_missing_imports = True + +[mypy-h5py.*] +ignore_missing_imports = True + +[mypy-flatbuffers.*] +ignore_missing_imports = True + +[mypy-soundfile.*] +ignore_missing_imports = True + +[mypy-datasets.*] +ignore_missing_imports = True + +[mypy-keras.*] +ignore_missing_imports = True + +[mypy-rangefilter.filters.*] +ignore_missing_imports = True + +[mypy-schema.*] +ignore_missing_imports = True + +[mypy-gdown.*] +ignore_missing_imports = True + +[mypy-aimet_torch.*] +ignore_missing_imports = True + +[mypy-boto3.*] +ignore_missing_imports = True + +[mypy-botocore.*] +ignore_missing_imports = True + +[mypy-ruamel.*] +ignore_missing_imports = True + +[mypy-qai_hub_models.models.*] +ignore_errors = true diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 00000000..0bb3304c --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,26 @@ +{ + "exclude": [ + "**/.mypy_cache", + "**/.pytest_cache", + "**/__pycache__", + "**/node_modules", + "build/tungsten", + "src/public/staging_python/qai_hub_staging", + "src/tungsten", + "src/www/onnx-optimizer/third_party", + "src/www/onnx-simplifier/third_party", + ], + + 
"extraPaths": [ + "./build/proto" + ], + + "reportMissingModuleSource": "none", + "reportMissingImports": true, + "reportMissingTypeStubs": false, + "reportShadowedImports": false, + "verboseOutput": false, + + "venvPath": ".", + "venv": "qaism-dev" +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..6b6221ac --- /dev/null +++ b/pytest.ini @@ -0,0 +1,16 @@ +[pytest] +testpaths = qai_hub_models +norecursedirs = build +python_files = tests.py test_*.py test.py +filterwarnings = + ignore::DeprecationWarning:coremltools.*: + ignore::DeprecationWarning:torch.*: + ignore::DeprecationWarning:torchvision.*: + ignore::DeprecationWarning:tensorflow.*: + ignore::DeprecationWarning:tensorflow-macos.*: + ignore::DeprecationWarning:tensorflow-metal.*: + ignore::DeprecationWarning:tensorflow-probability.*: +markers = + serial: test must not be run in parallel + slow: marks tests as slow + slow_cloud: marks test as slow and cloud-dependent diff --git a/qai_hub_models/__init__.py b/qai_hub_models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/_version.py b/qai_hub_models/_version.py new file mode 100644 index 00000000..6cd38b74 --- /dev/null +++ b/qai_hub_models/_version.py @@ -0,0 +1 @@ +__version__ = "0.2.7" diff --git a/qai_hub_models/asset_bases.yaml b/qai_hub_models/asset_bases.yaml new file mode 100644 index 00000000..1b110e24 --- /dev/null +++ b/qai_hub_models/asset_bases.yaml @@ -0,0 +1,12 @@ +store_url: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models +web_asset_folder: models/{model_id}/web-assets +static_web_banner_filename: banner.png +animated_web_banner_filename: banner.mp4 +model_asset_folder: models/{model_id}/v{version} +dataset_asset_folder: datasets/{dataset_id}/v{version} +repo_url: https://github.com/quic/ai-hub-models/blob/main +qaihm_repo: qai_hub_models/models/{model_id} +example_use: qai_hub_models/models/{model_id}#example--usage +huggingface_path: qualcomm/{model_name} 
class BSD300Dataset(BaseDataset):
    """
    BSD300 published here: https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/bsds/

    Each item is a ``(low_res, high_res)`` pair of float32 CHW tensors scaled
    to [0, 1]. The high-res image is a center crop of the source image and the
    low-res image is that crop downsized by ``scaling_factor``.
    """

    def __init__(self, scaling_factor=4):
        """
        Parameters:
            scaling_factor:
                Ratio between the high-res and low-res image sides.
        """
        self.bsd_path = BSD300_ASSET.path(extracted=True)
        self.images_path = os.path.join(self.bsd_path, "images/train")
        # BaseDataset.__init__ validates the data on disk and downloads it if needed.
        BaseDataset.__init__(self, self.bsd_path)
        self.scaling_factor = scaling_factor

    def _validate_data(self) -> bool:
        """Return True if the train folder exists and holds DATASET_LENGTH jpg files."""
        images_path = os.path.join(self.dataset_path, "images/train")

        # Check image path exists
        if not os.path.exists(images_path):
            return False

        # Ensure the correct number of images are there
        images = [f for f in os.listdir(images_path) if f.endswith(".jpg")]
        return len(images) == DATASET_LENGTH

    def _prepare_data(self):
        """
        Rename the downloaded images to ``img_NNN_HR.jpg`` (NNN = 001, 002, ...)
        so that ``__getitem__`` can address them by index.
        """
        directory = os.path.join(self.dataset_path, "images/train")
        # Sort for a deterministic index -> image mapping, and number only the
        # jpg files so that stray non-image entries cannot leave gaps.
        jpg_files = sorted(f for f in os.listdir(directory) if f.endswith(".jpg"))
        for i, filename in enumerate(jpg_files):
            source = os.path.join(directory, filename)
            target = os.path.join(directory, f"img_{i + 1:03d}_HR.jpg")
            if source == target:
                # Already has its final name; re-saving then removing would delete it.
                continue
            try:
                # Re-save the image under its enumerated name
                with Image.open(source) as img:
                    img.save(target)
                # delete the old image
                os.remove(source)
            except ValueError:
                print(f"File {filename} does not exist!")

    def __len__(self):
        return DATASET_LENGTH

    def __getitem__(self, item) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Load image ``item`` (0-based) and return ``(low_res, high_res)`` tensors.
        """
        # We use the super resolution GT-and-test image preparation from AIMET zoo:
        # https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/quicksrnet/dataloader/utils.py#L51

        image_file = os.path.join(self.images_path, f"img_{item + 1:03d}_HR.jpg")
        # Close the file handle as soon as the pixels are copied out.
        with Image.open(image_file) as pil_img:
            img = np.asarray(pil_img)
        height, width = img.shape[0:2]

        # If portrait, transpose to landscape so that all tensors are equal size
        if height > width:
            img = np.transpose(img, (1, 0, 2))
            height, width = img.shape[0:2]

        # Take the largest possible center-crop of it such that its dimensions
        # are perfectly divisible by the scaling factor. A 1.5x factor needs
        # sides divisible by 3 so that the low-res sides are whole numbers.
        modulus = (
            2 * self.scaling_factor
            if self.scaling_factor == 1.5
            else self.scaling_factor
        )
        x_remainder = width % modulus
        y_remainder = height % modulus
        left = int(x_remainder // 2)
        top = int(y_remainder // 2)
        right = int(left + (width - x_remainder))
        bottom = int(top + (height - y_remainder))
        hr_img = img[top:bottom, left:right]

        hr_height, hr_width = hr_img.shape[0:2]

        hr_img = np.array(hr_img, dtype="uint8")
        # Derive the low-res size from the *cropped* image. Using the uncropped
        # size only works when truncation happens to hide the remainder.
        new_size = (
            int(hr_width / self.scaling_factor),
            int(hr_height / self.scaling_factor),
        )
        lr_img = np.asarray(Image.fromarray(hr_img).resize(new_size))
        lr_img = np.clip(lr_img, 0.0, 255.0).astype(np.uint8)

        lr_height, lr_width = lr_img.shape[0:2]

        # Sanity check
        assert (
            hr_width == lr_width * self.scaling_factor
            and hr_height == lr_height * self.scaling_factor
        )

        # HWC uint8 -> CHW float32 in [0, 1]
        lr_img_tensor = torch.from_numpy(lr_img.transpose((2, 0, 1))).contiguous()
        lr_img_tensor = lr_img_tensor.to(dtype=torch.float32).div(255)

        hr_img_tensor = torch.from_numpy(hr_img.transpose((2, 0, 1))).contiguous()
        hr_img_tensor = hr_img_tensor.to(dtype=torch.float32).div(255)

        return lr_img_tensor, hr_img_tensor

    def _download_data(self) -> None:
        """Fetch and extract the archive, then rename the images for indexing."""
        BSD300_ASSET.fetch(extract=True)
        self._prepare_data()
class CocoDataset(BaseDataset, CocoDetection):
    """
    COCO 2017 validation split, served through torchvision's CocoDetection.

    Contains ~5k images spanning 80 classes.
    """

    def __init__(self, target_image_size: Union[int, Tuple[int, int]] = 640):
        """
        Parameters:
            target_image_size:
                Size every image is resized to. A single int is used for both sides.
        """
        BaseDataset.__init__(self, str(COCO_DATASET.path(extracted=True)))
        CocoDetection.__init__(
            self,
            root=COCO_DATASET.path() / "val2017",
            annFile=COCO_ANNOTATIONS.path() / "annotations" / "instances_val2017.json",
        )

        # Map the COCO category ids onto consecutive indices 0..N-1, ordered by id.
        ordered_categories = sorted(
            self.coco.loadCats(self.coco.getCatIds()), key=lambda cat: cat["id"]
        )
        self.label_map = {
            cat["id"]: index for index, cat in enumerate(ordered_categories)
        }

        if isinstance(target_image_size, tuple):
            self.target_image_size = target_image_size
        else:
            self.target_image_size = (target_image_size, target_image_size)

    def __getitem__(self, item):
        """
        Returns:
            (image, (image_id, height, width, boxes, labels)), where boxes are
            corner coordinates normalized by the original image size, or
            (None, (None, None)) when the image has no annotations.
        """
        image, annotations = super().__getitem__(item)
        width, height = image.size
        if len(annotations) == 0:
            return None, (None, None)

        # COCO stores boxes as (x, y, w, h) in pixels; convert them to
        # normalized (x0, y0, x1, y1).
        raw_boxes = [annotation.get("bbox") for annotation in annotations]
        boxes = torch.tensor(
            [
                [
                    box[0] / width,
                    box[1] / height,
                    (box[0] + box[2]) / width,
                    (box[1] + box[3]) / height,
                ]
                for box in raw_boxes
            ]
        )
        labels = torch.tensor(
            [
                self.label_map[annotation.get("category_id")]
                for annotation in annotations
            ]
        )

        resized = image.resize(self.target_image_size)
        net_input = app_to_net_image_inputs(resized)[1]
        return net_input, (annotations[0]["image_id"], height, width, boxes, labels)

    def _validate_data(self) -> bool:
        """Check that the images and annotations exist and that at least 5000 images are present."""
        return (
            COCO_DATASET.path().exists()
            and COCO_ANNOTATIONS.path().exists()
            and len(os.listdir(COCO_DATASET.path() / "val2017")) >= 5000
        )

    def _download_data(self) -> None:
        """Fetch and extract both the image archive and the annotation archive."""
        COCO_DATASET.fetch(extract=True)
        COCO_ANNOTATIONS.fetch(extract=True)
class ImagenetteDataset(BaseDataset, ImageNet):
    """
    Class for using the Imagenette dataset published here:
        https://github.com/fastai/imagenette

    Contains ~4k images spanning 10 of the imagenet classes.
    """

    def __init__(self):
        # The devkit and its symlink must be in place before validation runs
        # inside BaseDataset.__init__, so fetch eagerly on every construction.
        self._download_data()
        BaseDataset.__init__(self, str(IMAGENETTE_ASSET.path(extracted=True)))
        ImageNet.__init__(
            self,
            root=IMAGENETTE_ASSET.path(),
            split="val",
            transform=IMAGENET_TRANSFORM,
            # Translate the 0-9 Imagenette label into the Imagenet-1K class id.
            target_transform=lambda val: IMAGENETTE_CLASS_MAP[val],
        )

    def _validate_data(self) -> bool:
        """
        Return True when the devkit is present and executable, and the "val"
        folder holds 10 class folders with at least 300 entries each.
        """
        devkit_path = DEVKIT_ASSET.path()

        # Check devkit exists
        if not devkit_path.exists():
            return False

        # Check devkit permissions
        devkit_permissions = os.stat(devkit_path).st_mode
        if devkit_permissions & stat.S_IEXEC != stat.S_IEXEC:
            return False

        # Check val data exists
        val_data_path = os.path.join(self.dataset_path, "val")
        if not os.path.exists(val_data_path):
            return False

        # Ensure 10 classes
        subdirs = os.listdir(val_data_path)
        if len(subdirs) != 10:
            return False

        # Ensure >= 300 samples per classes
        for subdir in subdirs:
            if len(os.listdir(os.path.join(val_data_path, subdir))) < 300:
                return False
        return True

    def _download_data(self) -> None:
        """
        Fetch the image archive and the devkit, mark the devkit executable and
        link it into the dataset folder.

        Safe to call repeatedly: the link is only created when it is missing.
        """
        IMAGENETTE_ASSET.fetch(extract=True)
        devkit_path = DEVKIT_ASSET.fetch()
        devkit_st = os.stat(devkit_path)
        os.chmod(devkit_path, devkit_st.st_mode | stat.S_IEXEC)

        link_path = IMAGENETTE_ASSET.path() / os.path.basename(DEVKIT_ASSET.path())
        # __init__ calls this on every construction; an unconditional
        # os.symlink would raise FileExistsError from the second time onwards.
        if not os.path.lexists(link_path):
            os.symlink(DEVKIT_ASSET.path(), link_path)
class BaseEvaluator(ABC):
    """
    Compares one or more model outputs against a ground truth and
    accumulates an accuracy metric over many batches.
    """

    @abstractmethod
    def add_batch(
        self,
        output,  # torch.Tensor | Collection[torch.Tensor]
        ground_truth,  # torch.Tensor | Collection[torch.Tensor]
    ) -> None:
        """
        Record the result of a single inference.

        Parameters:
            output: torch.Tensor | Collection[torch.Tensor]
                What the model's forward() returned for this inference:
                a tensor when forward() has 1 output, a tuple of tensors
                when it has several.

            ground_truth: torch.Tensor | Collection[torch.Tensor]
                The expected value(s) for this output.

                Implementations decide what the ground truth means and whether
                they accept a tensor, a Collection, or both.
        """
        pass

    @abstractmethod
    def reset(self) -> None:
        """Discard everything accumulated so far."""
        pass

    @abstractmethod
    def get_accuracy_score(self) -> float:
        """Return model accuracy as one float. Higher is better."""
        pass

    def add_from_dataset(
        self,
        model: torch.nn.Module,
        data: _DataLoader,
        eval_iterations: int | None = None,
        device: str = "cpu",
    ) -> None:
        """
        Run the model over the data loader and feed every result to add_batch.

        Parameters:
            model: torch.nn.Module
                Model that produces the outputs to evaluate.

            data: torch DataLoader | Collection
                Source of evaluation data. Each iteration should yield:
                    tuple(inputs: Collection[torch.Tensor] | torch.Tensor,
                          ground_truth: Collection[torch.Tensor] | torch.Tensor)

            eval_iterations: int | None
                How many iterations of iter(data) to evaluate.
                If None, the whole dataset is used.

            device: str
                Name of the device inference runs on.
        """
        # The callback receives (inputs, outputs, ground_truth); the inputs
        # are not needed for evaluation.
        _for_each_batch(
            model,
            data,
            num_samples=eval_iterations,
            device=device,
            data_has_gt=True,
            callback=lambda _inputs, outputs, ground_truth: self.add_batch(
                outputs, ground_truth
            ),
        )
class ClassificationEvaluator(BaseEvaluator):
    """Evaluator for tracking accuracy of a Classifier Model."""

    def __init__(self, num_classes: int = 1000):
        """
        Parameters:
            num_classes:
                Size of the last dimension expected in every output batch.
        """
        self.num_classes = num_classes
        self.reset()

    def add_batch(self, output: torch.Tensor, gt: int | torch.Tensor):
        """
        Count the correct top-1 predictions in one batch.

        Parameters:
            output:
                Class scores of shape (batch, num_classes).
            gt:
                Class index per sample, shape (batch,). A plain int is
                accepted for a batch of one.
        """
        # This evaluator supports only 1 output tensor at a time.
        assert len(output.shape) == 2 and output.shape[-1] == self.num_classes

        # torch.as_tensor keeps the *value* of an int; torch.Tensor(5) would
        # instead allocate an uninitialized tensor of length 5. Placing it on
        # the output's device keeps the comparison below on a single device.
        gt_tensor = torch.as_tensor(gt, device=output.device)
        if gt_tensor.dim() == 0:
            gt_tensor = gt_tensor.unsqueeze(0)
        assert len(gt_tensor.shape) == 1 and gt_tensor.shape[0] == output.shape[0]

        batch_size = output.shape[0]
        self.total_samples += batch_size
        # Accumulate a Python int so the score below is a real float.
        predictions = torch.argmax(output, dim=-1)
        self.num_correct += int((predictions == gt_tensor).sum().item())

    def reset(self):
        """Forget all batches seen so far."""
        self.num_correct = 0
        self.total_samples = 0

    def get_accuracy_score(self) -> float:
        """Return the fraction of correct predictions, or 0.0 if nothing was added."""
        if self.total_samples == 0:
            return 0.0
        return self.num_correct / self.total_samples
class SegmentationOutputEvaluator(BaseEvaluator):
    """
    Evaluator for comparing a batched image output.

    Accumulates a (num_classes x num_classes) confusion matrix whose rows are
    ground-truth classes and whose columns are predicted classes.
    """

    def __init__(self, num_classes):
        """
        Parameters:
            num_classes:
                Number of segmentation classes; labels outside
                [0, num_classes) in the ground truth are ignored.
        """
        self.num_classes = num_classes
        self.reset()

    def add_batch(self, output: torch.Tensor, gt: torch.Tensor):
        """Add a batch of predicted class maps and same-shaped ground-truth maps."""
        # This evaluator supports only 1 output tensor at a time.
        assert gt.shape == output.shape
        self.confusion_matrix += self._generate_matrix(gt, output)

    def reset(self):
        """Clear the accumulated confusion matrix."""
        self.confusion_matrix = torch.zeros((self.num_classes, self.num_classes))

    def get_accuracy_score(self) -> float:
        """
        Return mean intersection-over-union as the single accuracy score.

        Implements the abstract method of BaseEvaluator; without it this class
        cannot be instantiated.
        """
        return float(self.Mean_Intersection_over_Union())

    def Pixel_Accuracy(self):
        """Fraction of all counted pixels that were classified correctly."""
        Acc = torch.diag(self.confusion_matrix).sum() / self.confusion_matrix.sum()
        return Acc

    def Pixel_Accuracy_Class(self):
        """Per-class pixel accuracy, averaged over classes (NaN classes skipped)."""
        Acc = torch.diag(self.confusion_matrix) / self.confusion_matrix.sum(dim=1)
        Acc = torch.nanmean(Acc)
        return Acc

    def Intersection_over_Union(self):
        """Per-class IoU: diagonal / (row sum + column sum - diagonal)."""
        intersection = torch.diag(self.confusion_matrix)
        union = (
            torch.sum(self.confusion_matrix, dim=1)
            + torch.sum(self.confusion_matrix, dim=0)
            - intersection
        )
        return intersection / union

    def Mean_Intersection_over_Union(self):
        """IoU averaged over classes (NaN classes skipped)."""
        return torch.nanmean(self.Intersection_over_Union())

    def Frequency_Weighted_Intersection_over_Union(self):
        """IoU weighted by each class's share of ground-truth pixels."""
        freq = torch.sum(self.confusion_matrix, dim=1) / torch.sum(
            self.confusion_matrix
        )
        iu = self.Intersection_over_Union()

        # Classes that never occur in the ground truth have NaN IoU; skip them.
        FWIoU = (freq[freq > 0] * iu[freq > 0]).sum()
        return FWIoU

    def _generate_matrix(self, gt_image, pre_image):
        """Build the confusion matrix of one batch."""
        mask = (gt_image >= 0) & (gt_image < self.num_classes)
        # Encode each (gt, prediction) pair as one integer so that a single
        # bincount yields the flattened matrix. bincount needs an integer
        # tensor, so cast both operands.
        label = self.num_classes * gt_image[mask].long() + pre_image[mask].long()
        count = torch.bincount(label, minlength=self.num_classes**2)
        confusion_matrix = count.reshape(self.num_classes, self.num_classes)
        return confusion_matrix
class SuperResolutionOutputEvaluator(BaseEvaluator):
    """Evaluator that records the PSNR of each super-resolved image against its ground truth."""

    def __init__(self):
        # reset() creates the (empty) PSNR list.
        self.reset()

    def _rgb_to_yuv(self, img):
        """
        Project an HWC RGB image onto a single luma (Y) channel.

        Despite the name, only one channel is produced: the last axis of `img`
        is contracted with the three RGB weights.
        """
        # Convert to YUV as this is closer to human perception,
        # so PSNR will be more meaningful
        # Source:
        # https://github.com/quic/aimet-model-zoo/blob/main/aimet_zoo_torch/common/super_resolution/psnr.py#L18
        # NOTE(review): these weights presumably expect RGB values in [0, 1] -- confirm.
        rgb_weights = np.array([65.481, 128.553, 24.966])
        return np.matmul(img, rgb_weights) + 16.0

    def _compute_psnr(self, img, gt):
        """Return the PSNR (dB) between two images, using an 8-bit data range."""
        # Compute PSNR between two images
        # Assumed that they are in YUV format
        error = np.mean((img - gt) ** 2)
        eps = 1e-8  # a tiny amount to ensure no division by 0
        data_range = 255.0  # 8-bit data range

        return 10 * np.log10((data_range**2) / (error + eps))

    def add_batch(self, output: torch.Tensor, gt: torch.Tensor):
        """
        Record the PSNR of every image in the batch.

        Parameters:
            output: Predicted images; indexed on axis 0 and permuted from CHW to HWC.
            gt: Ground-truth images with exactly the same shape as `output`.
        """
        assert gt.shape == output.shape

        # Move to host memory before converting to numpy, matching the other
        # evaluators in this package; .numpy() fails on non-CPU tensors.
        output = output.detach().cpu()
        gt = gt.detach().cpu()

        for pred_chw, truth_chw in zip(output, gt):
            # Convert each to HWC and YUV for PSNR
            pred = self._rgb_to_yuv(pred_chw.permute((1, 2, 0)).numpy())
            truth = self._rgb_to_yuv(truth_chw.permute((1, 2, 0)).numpy())

            psnr = self._compute_psnr(pred, truth)
            self.psnr_list.append(psnr.item())

    def reset(self):
        """Forget all recorded PSNR values."""
        self.psnr_list = []

    def compute_average_psnr(self):
        """Return the mean of the recorded PSNR values (NaN when nothing was recorded)."""
        return np.mean(np.array(self.psnr_list))

    def get_accuracy_score(self) -> float:
        """Return the average PSNR over all images added since the last reset."""
        return self.compute_average_psnr()
b/qai_hub_models/models/_shared/cityscapes_segmentation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/models/_shared/cityscapes_segmentation/app.py b/qai_hub_models/models/_shared/cityscapes_segmentation/app.py new file mode 100644 index 00000000..490fe677 --- /dev/null +++ b/qai_hub_models/models/_shared/cityscapes_segmentation/app.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +import os +from typing import Optional + +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.transforms as standard_transforms +from PIL import Image as ImageModule +from PIL.Image import Image + +from qai_hub_models.models._shared.cityscapes_segmentation.model import ( + CITYSCAPES_MEAN, + CITYSCAPES_STD, + FFNET_SOURCE_PATCHES, + FFNET_SOURCE_REPO_COMMIT, + FFNET_SOURCE_REPOSITORY, + FFNET_SOURCE_VERSION, + MODEL_ASSET_VERSION, + MODEL_ID, +) +from qai_hub_models.utils.asset_loaders import ASSET_CONFIG, SourceAsRoot + + +def _load_cityscapes_loader(cityscapes_path: Optional[str] = None) -> object: + if cityscapes_path is None: + # Allow a loader without data. There are useful auxiliary functions. 
class CityscapesSegmentationApp:
    """
    This class consists of light-weight "app code" that is required to perform
    end to end inference for single-view (left) semantic segmentation of the
    Cityscapes (https://cityscapes-dataset.com/) dataset.

    The app uses 1 model:
        * Cityscapes segmentation model

    For a given image input, the app will:
        * Pre-process the image
        * Run model inference
        * Resize predictions to map image size
        * Visualize results by super-imposing on input image
    """

    def __init__(
        self,
        model: torch.nn.Module,
    ):
        self.model = model
        # The color palette comes from the FFNet Cityscapes dataset object; a
        # loader without data is sufficient for this.
        self.color_mapping = _load_cityscapes_loader().dataset.color_mapping

    def predict(self, image: Image, raw_output: bool = False) -> Image | np.ndarray:
        """
        From the provided image, predict semantic segmentation over
        the Cityscapes classes.

        Parameters:
            image: A PIL Image in RGB format.
            raw_output: If True, return the per-class scores instead of an
                annotated image.

        Returns:
            If raw_output is False, an annotated image of the same size as
            the input image (input blended 50/50 with the color-coded class
            mask). If True, the raw model output as a numpy array of shape
            [1, CLASSES, HEIGHT, WIDTH], where HEIGHT and WIDTH equal the
            input image size because the model output is bilinearly resized
            to the input resolution before being returned.
        """

        input_tensor = preprocess_cityscapes_image(image)
        with torch.no_grad():
            small_res_output = self.model(input_tensor)

        # The model predicts at a reduced resolution; upsample to input size.
        output = F.interpolate(
            small_res_output, (image.height, image.width), mode="bilinear"
        )
        if raw_output:
            return output.detach().numpy()
        predictions = output[0].argmax(0).byte().cpu().numpy()

        # Palette ("P") image: each pixel value is a class index that
        # putpalette() maps to its Cityscapes color.
        color_mask = ImageModule.fromarray(predictions.astype(np.uint8)).convert("P")
        color_mask.putpalette(self.color_mapping)
        out = ImageModule.blend(image, color_mask.convert("RGB"), 0.5)

        return out
# Run Cityscapes semantic segmentation end-to-end on a sample image.
# The annotated image is handed to display_or_save_image unless is_test is set.
def cityscapes_segmentation_demo(
    model_type: Type[CityscapesSegmentor],
    model_id: str,
    is_test: bool = False,
):
    """
    Parameters:
        model_type: Model class to load; also supplies the CLI arguments and
            the input spec the image is resized/padded to.
        model_id: Currently unused by this function; the id passed to
            `validate_on_device_demo_args` comes from `model_type.get_model_id()`.
        is_test: If True, CLI arguments are ignored (defaults are used) and
            the annotated image is not displayed or saved.
    """
    # Demo parameters
    parser = get_model_cli_parser(model_type)
    parser = get_on_device_demo_parser(
        parser, available_target_runtimes=[TargetRuntime.TFLITE], add_output_dir=True
    )
    parser.add_argument(
        "--image",
        type=str,
        help="File path or URL to an input image to use for the demo.",
    )
    args = parser.parse_args([] if is_test else None)
    validate_on_device_demo_args(args, model_type.get_model_id())

    if args.image is None:
        image = TEST_CITYSCAPES_LIKE_IMAGE_ASSET.fetch()
        image_name = TEST_CITYSCAPES_LIKE_IMAGE_NAME
    else:
        image = args.image
        image_name = os.path.basename(image)

    input_spec = model_type.get_input_spec()

    inference_model = demo_model_from_cli_args(model_type, args)
    app = CityscapesSegmentationApp(inference_model)

    # Resize and pad the input to the spatial size the model was declared with.
    (_, _, height, width) = input_spec["image"][0]
    orig_image = load_image(image)
    image, _, padding = pil_resize_pad(orig_image, (height, width))

    # Run app
    image_annotated = app.predict(image)

    # Resize / unpad annotated image
    image_annotated = pil_undo_resize_pad(image_annotated, orig_image.size, padding)

    if not is_test:
        display_or_save_image(
            image_annotated,
            args.output_dir,
            "annotated_" + image_name,
            "predicted image",
        )
class CityscapesSegmentationEvaluator(SegmentationOutputEvaluator):
    """
    Scores Cityscapes semantic segmentation output by mean IoU.
    """

    def add_batch(self, output: Tensor, gt: Tensor):
        """
        Upsample `output` to the ground-truth size, take the per-pixel argmax
        over axis 1, and add the resulting label map to the confusion matrix.
        """
        # Axes 1 and 2 of gt give the target spatial size.
        target_size = gt.shape[1:3]
        upsampled = F.interpolate(output, target_size, mode="bilinear")
        predicted_labels = upsampled.argmax(1).cpu()
        return super().add_batch(predicted_labels, gt)

    def get_accuracy_score(self) -> float:
        """Return the mean intersection-over-union accumulated so far."""
        return super().Mean_Intersection_over_Union()
class CityscapesSegmentor(BaseModel):
    """Thin wrapper that exposes a Cityscapes segmentation network as a BaseModel."""

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
        self.model = model

    def get_evaluator(self) -> BaseEvaluator:
        """Return a segmentation evaluator over the 19 Cityscapes classes in use."""
        return CityscapesSegmentationEvaluator(CITYSCAPES_NUM_CLASSES)

    def forward(self, image: torch.Tensor):
        """
        Predict semantic segmentation for an input `image`.

        Parameters:
            image: A [1, 3, height, width] image.
                   Assumes image has been resized and normalized using the
                   Cityscapes preprocesser (in cityscapes_segmentation/app.py).

        Returns:
            The unmodified output of the wrapped network. The app and
            evaluator in this package treat it as per-class scores laid out
            as [batch, classes, out_height, out_width] (argmax over axis 1),
            at a spatial resolution that may be lower than the input.
        """
        return self.model(image)

    @staticmethod
    def get_input_spec(
        batch_size: int = 1,
        num_channels: int = 3,
        height: int = 1024,
        width: int = 2048,
    ) -> InputSpec:
        # Get the input specification ordered (name -> (shape, type)) pairs for this model.
        #
        # This can be used with the qai_hub python API to declare
        # the model input specification upon submitting a compile job.
        return {"image": ((batch_size, num_channels, height, width), "float32")}
+ +diff --git a/datasets/.DS_Store b/ffnet_datasets/.DS_Store +similarity index 100% +rename from datasets/.DS_Store +rename to ffnet_datasets/.DS_Store +diff --git a/datasets/cityscapes/.DS_Store b/ffnet_datasets/cityscapes/.DS_Store +similarity index 100% +rename from datasets/cityscapes/.DS_Store +rename to ffnet_datasets/cityscapes/.DS_Store +diff --git a/datasets/cityscapes/attribution.txt b/ffnet_datasets/cityscapes/attribution.txt +similarity index 100% +rename from datasets/cityscapes/attribution.txt +rename to ffnet_datasets/cityscapes/attribution.txt +diff --git a/datasets/cityscapes/cityscapes.py b/ffnet_datasets/cityscapes/cityscapes.py +similarity index 86% +rename from datasets/cityscapes/cityscapes.py +rename to ffnet_datasets/cityscapes/cityscapes.py +index a441c91..f43c98c 100644 +--- a/datasets/cityscapes/cityscapes.py ++++ b/ffnet_datasets/cityscapes/cityscapes.py +@@ -1,6 +1,6 @@ + import os + import os.path as path +-import datasets.cityscapes.cityscapes_labels as cityscapes_labels ++import ffnet_datasets.cityscapes.cityscapes_labels as cityscapes_labels + + + def find_directories(root): +diff --git a/datasets/cityscapes/cityscapes_labels.py b/ffnet_datasets/cityscapes/cityscapes_labels.py +similarity index 100% +rename from datasets/cityscapes/cityscapes_labels.py +rename to ffnet_datasets/cityscapes/cityscapes_labels.py +diff --git a/datasets/cityscapes/dataloader/__init__.py b/ffnet_datasets/cityscapes/dataloader/__init__.py +similarity index 100% +rename from datasets/cityscapes/dataloader/__init__.py +rename to ffnet_datasets/cityscapes/dataloader/__init__.py +diff --git a/datasets/cityscapes/dataloader/base_loader.py b/ffnet_datasets/cityscapes/dataloader/base_loader.py +similarity index 98% +rename from datasets/cityscapes/dataloader/base_loader.py +rename to ffnet_datasets/cityscapes/dataloader/base_loader.py +index b503b8a..f1a1b37 100644 +--- a/datasets/cityscapes/dataloader/base_loader.py ++++ 
b/ffnet_datasets/cityscapes/dataloader/base_loader.py +@@ -32,9 +32,9 @@ import torch + from PIL import Image + from torch.utils import data + from config import CITYSCAPES_IGNORE_LABEL, CITYSCAPES_NUM_CLASSES, cityscapes_base_path +-from datasets.cityscapes.utils.misc import tensor_to_pil +-from datasets.cityscapes.cityscapes import find_directories +-import datasets.cityscapes.cityscapes_labels as cityscapes_labels ++from ffnet_datasets.cityscapes.utils.misc import tensor_to_pil ++from ffnet_datasets.cityscapes.cityscapes import find_directories ++import ffnet_datasets.cityscapes.cityscapes_labels as cityscapes_labels + from scipy.ndimage.morphology import distance_transform_edt + + +diff --git a/datasets/cityscapes/dataloader/get_dataloaders.py b/ffnet_datasets/cityscapes/dataloader/get_dataloaders.py +similarity index 84% +rename from datasets/cityscapes/dataloader/get_dataloaders.py +rename to ffnet_datasets/cityscapes/dataloader/get_dataloaders.py +index 347f7db..5596f5a 100644 +--- a/datasets/cityscapes/dataloader/get_dataloaders.py ++++ b/ffnet_datasets/cityscapes/dataloader/get_dataloaders.py +@@ -1,11 +1,11 @@ +-# import datasets.cityscapes.dataloader.joint_transforms as joint_transforms +-import datasets.cityscapes.dataloader.transforms as extended_transforms ++# import ffnet_datasets.cityscapes.dataloader.joint_transforms as joint_transforms ++import ffnet_datasets.cityscapes.dataloader.transforms as extended_transforms + from torch.utils.data import DataLoader + + import importlib + import torchvision.transforms as standard_transforms + from config import CITYSCAPES_MEAN, CITYSCAPES_STD +-from datasets.cityscapes.dataloader.base_loader import Cityscapes ++from ffnet_datasets.cityscapes.dataloader.base_loader import Cityscapes + + + def return_dataloader(num_workers, batch_size): +diff --git a/datasets/cityscapes/dataloader/sampler.py b/ffnet_datasets/cityscapes/dataloader/sampler.py +similarity index 100% +rename from 
datasets/cityscapes/dataloader/sampler.py +rename to ffnet_datasets/cityscapes/dataloader/sampler.py +diff --git a/datasets/cityscapes/dataloader/transforms.py b/ffnet_datasets/cityscapes/dataloader/transforms.py +similarity index 100% +rename from datasets/cityscapes/dataloader/transforms.py +rename to ffnet_datasets/cityscapes/dataloader/transforms.py +diff --git a/datasets/cityscapes/utils/__init__.py b/ffnet_datasets/cityscapes/utils/__init__.py +similarity index 100% +rename from datasets/cityscapes/utils/__init__.py +rename to ffnet_datasets/cityscapes/utils/__init__.py +diff --git a/datasets/cityscapes/utils/attr_dict.py b/ffnet_datasets/cityscapes/utils/attr_dict.py +similarity index 100% +rename from datasets/cityscapes/utils/attr_dict.py +rename to ffnet_datasets/cityscapes/utils/attr_dict.py +diff --git a/datasets/cityscapes/utils/misc.py b/ffnet_datasets/cityscapes/utils/misc.py +similarity index 99% +rename from datasets/cityscapes/utils/misc.py +rename to ffnet_datasets/cityscapes/utils/misc.py +index 26a4f59..df84db8 100644 +--- a/datasets/cityscapes/utils/misc.py ++++ b/ffnet_datasets/cityscapes/utils/misc.py +@@ -9,7 +9,7 @@ import numpy as np + + import torchvision.transforms as standard_transforms + import torchvision.utils as vutils +-from datasets.cityscapes import cityscapes_labels ++from ffnet_datasets.cityscapes import cityscapes_labels + + # from tabulate import tabulate + from PIL import Image +diff --git a/datasets/cityscapes/utils/my_data_parallel.py b/ffnet_datasets/cityscapes/utils/my_data_parallel.py +similarity index 100% +rename from datasets/cityscapes/utils/my_data_parallel.py +rename to ffnet_datasets/cityscapes/utils/my_data_parallel.py +diff --git a/datasets/cityscapes/utils/progress_bar.py b/ffnet_datasets/cityscapes/utils/progress_bar.py +similarity index 100% +rename from datasets/cityscapes/utils/progress_bar.py +rename to ffnet_datasets/cityscapes/utils/progress_bar.py +diff --git 
a/datasets/cityscapes/utils/trnval_utils.py b/ffnet_datasets/cityscapes/utils/trnval_utils.py +similarity index 96% +rename from datasets/cityscapes/utils/trnval_utils.py +rename to ffnet_datasets/cityscapes/utils/trnval_utils.py +index 7bff368..5da25e3 100644 +--- a/datasets/cityscapes/utils/trnval_utils.py ++++ b/ffnet_datasets/cityscapes/utils/trnval_utils.py +@@ -31,10 +31,10 @@ import os + import torch + + from config import CITYSCAPES_IGNORE_LABEL, CITYSCAPES_NUM_CLASSES +-from datasets.cityscapes.utils.misc import fast_hist, fmt_scale ++from ffnet_datasets.cityscapes.utils.misc import fast_hist, fmt_scale + +-# from datasets.cityscapes.utils.misc import AverageMeter, eval_metrics +-# from datasets.cityscapes.utils.misc import metrics_per_image ++# from ffnet_datasets.cityscapes.utils.misc import AverageMeter, eval_metrics ++# from ffnet_datasets.cityscapes.utils.misc import metrics_per_image + import numpy as np + + +diff --git a/datasets/imagenet/imagenet_data_loader.py b/ffnet_datasets/imagenet/imagenet_data_loader.py +similarity index 100% +rename from datasets/imagenet/imagenet_data_loader.py +rename to ffnet_datasets/imagenet/imagenet_data_loader.py +diff --git a/scripts/evaluate_cityscapes.py b/scripts/evaluate_cityscapes.py +index 158daa6..afcfd11 100644 +--- a/scripts/evaluate_cityscapes.py ++++ b/scripts/evaluate_cityscapes.py +@@ -11,10 +11,10 @@ import numpy as np + import torch + import os + import sys +-from datasets.cityscapes.utils.misc import AverageMeter, eval_metrics +-from datasets.cityscapes.utils.trnval_utils import eval_minibatch +-from datasets.cityscapes.utils.progress_bar import printProgressBar +-from datasets.cityscapes.dataloader.get_dataloaders import return_dataloader ++from ffnet_datasets.cityscapes.utils.misc import AverageMeter, eval_metrics ++from ffnet_datasets.cityscapes.utils.trnval_utils import eval_minibatch ++from ffnet_datasets.cityscapes.utils.progress_bar import printProgressBar ++from 
def replace_module_recursively(
    module: torch.nn.Module,
    tgt_cls: Type[torch.nn.Module],
    new_cls: Type[torch.nn.Module],
    parent_module: "Type[torch.nn.Module] | None" = None,
):
    """
    Replace all instances of `tgt_cls` with `new_cls`. If `parent_module` is
    specified, `tgt_cls` instance must be an immediate member of
    `parent_module` (useful for limiting replacement scope)

    Parameters:
        module: Root module; it is modified in place.
        tgt_cls: Class of the submodules to replace.
        new_cls: Replacement class, constructed as `new_cls(old_child)`.
        parent_module: Optional class restricting replacement to children
            whose direct parent is an instance of it.
    """
    for name, child in module.named_children():
        if isinstance(child, tgt_cls):
            if parent_module is None or isinstance(module, parent_module):
                setattr(module, name, new_cls(child))
        else:
            # Forward parent_module so the scope restriction also applies
            # below the first level (it was previously dropped here, which
            # replaced every nested tgt_cls regardless of its parent).
            replace_module_recursively(child, tgt_cls, new_cls, parent_module)
def deeplabv3_demo(
    model_type: Type[BaseModel],
    default_image: str | CachedWebAsset,
    num_classes: int,
    is_test: bool,
):
    """
    Run DeepLabV3 segmentation on one image.

    Parameters:
        model_type: Model class to instantiate from the CLI arguments.
        default_image: Image used when `--image` is not given.
        num_classes: Number of classes, forwarded to DeepLabV3App.
        is_test: If True, CLI arguments are ignored (defaults are used) and
            the result is not passed to display_or_save_image.
    """
    # Demo parameters
    cli = get_model_cli_parser(model_type)
    cli.add_argument(
        "--image",
        type=str,
        default=default_image,
        help="image file path or URL.",
    )
    add_output_dir_arg(cli)
    cli_args = cli.parse_args([] if is_test else None)

    # This DeepLabV3 ResNet 50 demo comes from
    # https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101/
    # load image and model
    rgb_image = load_image(cli_args.image).convert("RGB")
    segmenter = DeepLabV3App(
        model_from_cli_args(model_type, cli_args), num_classes=num_classes
    )
    segmented = segmenter.predict(rgb_image, False)
    if is_test:
        return
    display_or_save_image(segmented, cli_args.output_dir)
class DETRApp:
    """
    This class consists of light-weight "app code" that is required to
    perform end to end inference with DETR.

    For a given image input, the app will:
        * Preprocess the image (normalize, resize, etc) and get encoding to pass to the model.
        * Run DETR Inference
        * Convert the raw output into box coordinates and corresponding label and confidence.
    """

    def __init__(
        self,
        model: Callable[[torch.Tensor], torch.Tensor],
        model_image_input_size: Tuple[int, int] | None = None,
    ):
        """
        Parameters:
            model: Callable invoked as `model(pixel_values, pixel_mask)`;
                elements 0 and 1 of its result are used as class logits and boxes.
            model_image_input_size: Optional (height, width) the image
                processor should resize to. If None, the processor's own
                default size is used.
        """
        self.model = model
        self.model_image_input_size = model_image_input_size

    def predict(
        self,
        image: Image.Image,
        default_weights: str,
        threshold: float = 0.9,
    ) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        From the provided image, detect objects and draw their boxes.

        Parameters:
            image: PIL.Image
                A single input image.
            default_weights: str
                Default weights name for the model; used to load the
                matching DetrImageProcessor.
            threshold: float
                Prediction score threshold.

        Returns:
            numpy_array: Frames produced by app_to_net_image_inputs with the
                         predictions drawn on the first frame.
            score: Best class score of each prediction above the threshold.
                   Shape is [Number of predictions above threshold]
            label: Labels (class number) for the predicted class.
                   Shape is [Number of predictions above threshold]
            box: Box coordinates (top left and bottom right)
                 Shape is [Number of predictions above threshold x top_left_x, top_left_y, bottom_right_x, bottom_right_y]
        """
        size = (
            {
                "width": self.model_image_input_size[1],
                "height": self.model_image_input_size[0],
            }
            if self.model_image_input_size
            else None
        )

        image_processor = DetrImageProcessor.from_pretrained(default_weights, size=size)
        encoding = image_processor(image, return_tensors="pt")
        outputs = self.model(encoding["pixel_values"], encoding["pixel_mask"].float())
        # PIL reports (width, height); reverse to (height, width).
        target_sizes = torch.tensor(image.size[::-1]).unsqueeze(0)

        out_logits, out_bbox = outputs[0], outputs[1]
        prob = torch.nn.functional.softmax(out_logits, -1)
        # The last class is excluded before taking the best class per prediction.
        scores, labels = prob[..., :-1].max(-1)

        # Convert to [x0, y0, x1, y1] format
        boxes = box_xywh_to_xyxy(out_bbox.view(-1, 2, 2)).view(-1, 4)

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)

        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        # Broadcasting against scale_fct re-introduces the leading batch axis.
        boxes = boxes * scale_fct[:, None, :]

        # A single image is processed, so the batch holds exactly one entry.
        # Index it explicitly instead of looping over the batch and keeping
        # whatever the last iteration happened to assign.
        keep = scores[0] > threshold
        score = scores[0][keep]
        label = labels[0][keep]
        box = boxes[0][keep]

        NHWC_int_numpy_frames, _ = app_to_net_image_inputs(image)
        for pred_score, (xmin, ymin, xmax, ymax), pred_label in zip(
            score, box.tolist(), label
        ):
            draw_box_from_xyxy(
                NHWC_int_numpy_frames[0],
                (int(xmin), int(ymin)),
                (int(xmax), int(ymax)),
                color=(0, 255, 0),
                size=2,
                text=f"{LABEL_MAP[pred_label.item()]}: {pred_score.item():0.2f}",
            )

        return NHWC_int_numpy_frames, score, label, box
"cupboard", + 109: "curtain", + 110: "desk", + 111: "dirt", + 112: "door", + 113: "fence", + 114: "floor", + 115: "floor", + 116: "floor", + 117: "floor", + 118: "floor", + 119: "flower", + 120: "fog", + 121: "food", + 122: "fruit", + 123: "furniture", + 124: "grass", + 125: "gravel", + 126: "ground", + 127: "hill", + 128: "house", + 129: "leaves", + 130: "light", + 131: "mat", + 132: "metal", + 133: "mirror", + 134: "moss", + 135: "mountain", + 136: "mud", + 137: "napkin", + 138: "net", + 139: "paper", + 140: "pavement", + 141: "pillow", + 142: "plant", + 143: "plastic", + 144: "platform", + 145: "playingfield", + 146: "railing", + 147: "railroad", + 148: "river", + 149: "road", + 150: "rock", + 151: "roof", + 152: "rug", + 153: "salad", + 154: "sand", + 155: "sea", + 156: "shelf", + 157: "sky", + 158: "skyscraper", + 159: "snow", + 160: "solid", + 161: "stairs", + 162: "stone", + 163: "straw", + 164: "structural", + 165: "table", + 166: "tent", + 167: "textile", + 168: "towel", + 169: "tree", + 170: "vegetable", + 171: "wall", + 172: "wall", + 173: "wall", + 174: "wall", + 175: "wall", + 176: "wall", + 177: "wall", + 178: "water", + 179: "waterdrops", + 180: "window", + 181: "window", + 182: "wood", +} diff --git a/qai_hub_models/models/_shared/detr/demo.py b/qai_hub_models/models/_shared/detr/demo.py new file mode 100644 index 00000000..0aeae28e --- /dev/null +++ b/qai_hub_models/models/_shared/detr/demo.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from typing import Type + +from PIL import Image + +from qai_hub_models.models._shared.detr.app import DETRApp +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_image +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.display import display_or_save_image + + +# Run DETR app end-to-end on a sample 
image. +# The demo will display the predicted mask in a window. +def detr_demo( + model: Type[BaseModel], + default_weights: str, + default_image: str | CachedWebAsset, + is_test: bool = False, +): + # Demo parameters + parser = get_model_cli_parser(model) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=default_image, + help="test image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, model.get_model_id()) + + # Load image & model + detr = demo_model_from_cli_args(model, args) + + # Run app to scores, labels and boxes + img = load_image(args.image) + app = DETRApp(detr, model_image_input_size=[img.height, img.width]) + pred_images, _, _, _ = app.predict(img, default_weights) + pred_image = Image.fromarray(pred_images[0]) + + # Show the predicted boxes, scores and class names on the image. + if is_test: + assert isinstance(pred_image, Image.Image) + else: + display_or_save_image(pred_image, args.output_dir) diff --git a/qai_hub_models/models/_shared/detr/model.py b/qai_hub_models/models/_shared/detr/model.py new file mode 100644 index 00000000..03b9821a --- /dev/null +++ b/qai_hub_models/models/_shared/detr/model.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import Tuple + +import torch +import torch.nn as nn +from transformers import DetrForObjectDetection + +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + + +class DETR(BaseModel): + """Exportable DETR model, end-to-end.""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained(cls, ckpt_name: str): + model = DetrForObjectDetection.from_pretrained(ckpt_name) + model.eval() + return cls(model) + + def forward( + self, image: torch.Tensor, mask: torch.Tensor 
+ ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Run DETR on `image` and `mask`, and produce high quality detection results. + + Parameters: + image: Image tensor to run detection on. + mask: This represents the padding mask. True if padding was applied on that pixel else False. + + Returns: + predictions: Tuple of tensors (logits and coordinates) + Shape of logit tensor: [1, 100 (number of predictions), 92 (number of classes)] + Shape of coordinates: [1, 100, 4] + + """ + predictions = self.model(image, mask, return_dict=False) + return predictions + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 480, + width: int = 480, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm® AI Hub. + """ + return { + "image": ((batch_size, num_channels, height, width), "float32"), + "mask": ((batch_size, height, width), "float32"), + } diff --git a/qai_hub_models/models/_shared/fastsam/__init__.py b/qai_hub_models/models/_shared/fastsam/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/models/_shared/fastsam/app.py b/qai_hub_models/models/_shared/fastsam/app.py new file mode 100644 index 00000000..04a9a523 --- /dev/null +++ b/qai_hub_models/models/_shared/fastsam/app.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +from typing import Callable, Tuple + +import numpy as np +import torch +from PIL import Image +from ultralytics.engine.results import Results +from ultralytics.models.fastsam import FastSAMPrompt +from ultralytics.models.fastsam.utils import bbox_iou +from ultralytics.utils import ops + +from qai_hub_models.utils.image_processing import preprocess_PIL_image + + +class FastSAMApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with FastSAM. 
+ + The app uses 1 model: + * FastSAM + + For a given image input, the app will: + * pre-process the image (convert to range[0, 1]) + * Run FastSAM inference + * post-process the image + * display the input and output side-by-side + """ + + def __init__( + self, + fastsam_model: Callable[[torch.Tensor], torch.Tensor], + confidence: float = 0.4, + iou_threshold: float = 0.9, + retina_masks: bool = True, + model_image_input_shape: Tuple[int, int] = (640, 640), + ): + self.model = fastsam_model + self.confidence = confidence + self.iou_threshold = iou_threshold + self.retina_masks = retina_masks + self.model_image_input_shape = model_image_input_shape + + def predict(self, *args, **kwargs): + # See upscale_image. + return self.segment_image(*args, **kwargs) + + def segment_image(self, image_path: str) -> Results: + """ + Upscale provided images + + Parameters: + pixel_values_or_image: torch.Tensor + Input PIL image (before pre-processing) or pyTorch tensor (after image pre-processing). + + Returns: + images: List[PIL.Image.Image] + A list of upscaled images (one for each input image). 
+ """ + original_image = Image.open(image_path) + resized_image = original_image.resize( + (self.model_image_input_shape[0], self.model_image_input_shape[1]) + ) + img = preprocess_PIL_image(resized_image) + original_image = np.array(original_image) + image_path = [image_path] + preds = self.model(img) + preds = tuple( + (preds[0], tuple(([preds[1], preds[2], preds[3]], preds[4], preds[5]))) + ) + p = ops.non_max_suppression( + preds[0], + self.confidence, + self.iou_threshold, + agnostic=False, + max_det=100, + nc=1, # set to 1 class since SAM has no class predictions + classes=None, + ) + + full_box = torch.zeros(p[0].shape[1], device=p[0].device) + full_box[2], full_box[3], full_box[4], full_box[6:] = ( + img.shape[3], + img.shape[2], + 1.0, + 1.0, + ) + full_box = full_box.view(1, -1) + critical_iou_index = bbox_iou( + full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:] + ) + if critical_iou_index.numel() != 0: + full_box[0][4] = p[0][critical_iou_index][:, 4] + full_box[0][6:] = p[0][critical_iou_index][:, 6:] + p[0][critical_iou_index] = full_box + + results = [] + proto = ( + preds[1][-1] if len(preds[1]) == 3 else preds[1] + ) # second output is len 3 if pt, but only 1 if exported + for i, pred in enumerate(p): + orig_img = original_image + img_path = image_path[0][i] + # No predictions, no masks + if not len(pred): + masks = None + elif self.retina_masks: + pred[:, :4] = ops.scale_boxes( + img.shape[2:], pred[:, :4], orig_img.shape + ) + + masks = ops.process_mask_native( + proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2] + ) # HWC + else: + masks = ops.process_mask( + proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True + ) # HWC + pred[:, :4] = ops.scale_boxes( + img.shape[2:], pred[:, :4], orig_img.shape + ) + results.append( + Results( + orig_img, + path=img_path, + names="fastsam", + boxes=pred[:, :6], + masks=masks, + ) + ) + prompt_process = FastSAMPrompt(image_path[0], results, device="cpu") + segmented_result = 
prompt_process.everything_prompt() + return segmented_result, prompt_process diff --git a/qai_hub_models/models/_shared/fastsam/demo.py b/qai_hub_models/models/_shared/fastsam/demo.py new file mode 100644 index 00000000..c244eca4 --- /dev/null +++ b/qai_hub_models/models/_shared/fastsam/demo.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import os +import tempfile +from typing import Type + +from PIL import Image + +from qai_hub_models.models._shared.fastsam.app import FastSAMApp +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_path +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.display import display_or_save_image + + +def fastsam_demo( + model_type: Type[BaseModel], image_path: str | CachedWebAsset, is_test: bool +): + # Demo parameters + parser = get_model_cli_parser(model_type) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=image_path, + help="image file path or URL.", + ) + + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, model_type.get_model_id()) + + model = demo_model_from_cli_args(model_type, args) + app = FastSAMApp(model) + + with tempfile.TemporaryDirectory() as tmpdir: + image_path = load_path(args.image, tmpdir) + pred, prompt_process = app.segment_image(image_path) + + # Store the output image + output_dirname, _ = os.path.split(image_path) + output_path = os.path.join(output_dirname, "output.jpg") + prompt_process.plot(annotations=pred, output=output_path) + + # Display the output + output_image = Image.open(output_path) + if not is_test: + display_or_save_image(output_image, args.output_dir) diff --git a/qai_hub_models/models/_shared/fastsam/model.py b/qai_hub_models/models/_shared/fastsam/model.py new file mode 
100644 index 00000000..8dbb384f --- /dev/null +++ b/qai_hub_models/models/_shared/fastsam/model.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import torch +import torch.nn as nn +from ultralytics import FastSAM + +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + + +class Fast_SAM(BaseModel): + """Exportable FastSAM model, end-to-end.""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained(cls, ckpt_name: str): + model = FastSAM(ckpt_name).model + model.eval() + return cls(model) + + def forward(self, image: torch.Tensor): + """ + Run FastSAM on `image`, and produce high quality segmentation masks. + Faster than SAM as it is based on YOLOv8. + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: BGR + Returns: + + """ + predictions = self.model(image) + # Return predictions as a tuple instead of nested tuple. + return ( + predictions[0], + predictions[1][0][0], + predictions[1][0][1], + predictions[1][0][2], + predictions[1][1], + predictions[1][2], + ) + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm® AI Hub. 
+ """ + return {"image": ((batch_size, num_channels, height, width), "float32")} diff --git a/qai_hub_models/models/_shared/ffnet/__init__.py b/qai_hub_models/models/_shared/ffnet/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/models/_shared/ffnet/model.py b/qai_hub_models/models/_shared/ffnet/model.py new file mode 100644 index 00000000..61b1da4d --- /dev/null +++ b/qai_hub_models/models/_shared/ffnet/model.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import os + +import torch + +from qai_hub_models.models._shared.cityscapes_segmentation.model import ( + FFNET_SOURCE_PATCHES, + FFNET_SOURCE_REPO_COMMIT, + FFNET_SOURCE_REPOSITORY, + FFNET_SOURCE_VERSION, +) +from qai_hub_models.models._shared.cityscapes_segmentation.model import ( + MODEL_ID as CS_MODEL_ID, +) +from qai_hub_models.models._shared.cityscapes_segmentation.model import ( + CityscapesSegmentor, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +FFNET_WEIGHTS_URL_ROOT = ( + "https://github.com/quic/aimet-model-zoo/releases/download/torch_segmentation_ffnet" +) +FFNET_SUBPATH_NAME_LOOKUP = { + # Variant name (in FFNet repo) to (subpath, src_name, dst_name) + "segmentation_ffnet40S_dBBB_mobile": ( + "ffnet40S", + "ffnet40S_dBBB_cityscapes_state_dict_quarts.pth", + "ffnet40S_dBBB_cityscapes_state_dict_quarts.pth", + ), + "segmentation_ffnet54S_dBBB_mobile": ( + "ffnet54S", + "ffnet54S_dBBB_cityscapes_state_dict_quarts.pth", + "ffnet54S_dBBB_cityscapes_state_dict_quarts.pth", + ), + "segmentation_ffnet78S_dBBB_mobile": ( + "ffnet78S", + "ffnet78S_dBBB_cityscapes_state_dict_quarts.pth", + "ffnet78S_dBBB_cityscapes_state_dict_quarts.pth", + ), + "segmentation_ffnet78S_BCC_mobile_pre_down": ( + "ffnet78S", + "ffnet78S_BCC_cityscapes_state_dict_quarts_pre_down.pth", + 
"ffnet78S_BCC_cityscapes_state_dict_quarts.pth", + ), + "segmentation_ffnet122NS_CCC_mobile_pre_down": ( + "ffnet122NS", + "ffnet122NS_CCC_cityscapes_state_dict_quarts_pre_down.pth", + "ffnet122NS_CCC_cityscapes_state_dict_quarts.pth", + ), +} + + +class FFNet(CityscapesSegmentor): + """Exportable FFNet fuss-free Cityscapes segmentation model.""" + + @classmethod + def from_pretrained(cls, variant_name: str) -> FFNet: + model = _load_ffnet_source_model(variant_name) + model.eval() + + return cls(model) + + +def _load_ffnet_source_model(variant_name) -> torch.nn.Module: + subpath, src_name, dst_name = FFNET_SUBPATH_NAME_LOOKUP[variant_name] + + weights_url = os.path.join(FFNET_WEIGHTS_URL_ROOT, src_name) + weights_path = CachedWebModelAsset( + weights_url, + MODEL_ID, + MODEL_ASSET_VERSION, + os.path.join(subpath, dst_name), + ).fetch() + root_weights_path = os.path.dirname(os.path.dirname(weights_path)) + + """ + orig_weights_path = download_data(weights_url, MODEL_ID) + + root_weights_path = os.path.dirname(orig_weights_path) + + # FFNet requires the weights to be located in a sub-directory + weights_path = os.path.join(root_weights_path, subpath, dst_name) + os.makedirs(os.path.dirname(weights_path), exist_ok=True) + shutil.move(src=orig_weights_path, dst=weights_path) + """ + + # Re-use the repo from _shared/cityscapes_segmentation + with SourceAsRoot( + FFNET_SOURCE_REPOSITORY, + FFNET_SOURCE_REPO_COMMIT, + CS_MODEL_ID, + FFNET_SOURCE_VERSION, + source_repo_patches=FFNET_SOURCE_PATCHES, + ): + + # config, models are top-level packages in the FFNet repo + import config + + config.model_weights_base_path = root_weights_path + from models.model_registry import model_entrypoint + + model = model_entrypoint(variant_name)().eval() + return model + + +class FFNetLowRes(FFNet): + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 512, + width: int = 1024, + ) -> InputSpec: + return FFNet.get_input_spec(batch_size, 
num_channels, height, width) diff --git a/qai_hub_models/models/_shared/ffnet/test_utils.py b/qai_hub_models/models/_shared/ffnet/test_utils.py new file mode 100644 index 00000000..d3f21556 --- /dev/null +++ b/qai_hub_models/models/_shared/ffnet/test_utils.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import numpy as np +import torch + +from qai_hub_models.models._shared.cityscapes_segmentation.demo import ( + TEST_CITYSCAPES_LIKE_IMAGE_ASSET, +) +from qai_hub_models.models._shared.ffnet.model import FFNet, _load_ffnet_source_model +from qai_hub_models.utils.asset_loaders import load_image +from qai_hub_models.utils.image_processing import preprocess_PIL_image + + +def run_test_off_target_numerical( + ffnet_cls: FFNet, variant_name: str, relax_numerics: bool = False +): + """Verify that raw (numeric) outputs of both (qaism and non-qaism) networks are the same.""" + processed_sample_image = preprocess_PIL_image( + load_image(TEST_CITYSCAPES_LIKE_IMAGE_ASSET) + ) + source_model = _load_ffnet_source_model(variant_name) + qaism_model = ffnet_cls.from_pretrained() + + with torch.no_grad(): + source_out = source_model(processed_sample_image) + qaism_out = qaism_model(processed_sample_image) + + if relax_numerics: + # At least 90% of pixels should have original prediction + assert (source_out.argmax(1) == qaism_out.argmax(1)).float().mean() > 0.9 + else: + np.testing.assert_array_almost_equal(source_out, qaism_out) diff --git a/qai_hub_models/models/_shared/ffnet_quantized/__init__.py b/qai_hub_models/models/_shared/ffnet_quantized/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/models/_shared/ffnet_quantized/aimet_config.json b/qai_hub_models/models/_shared/ffnet_quantized/aimet_config.json new file mode 100644 index 00000000..c81ef65f --- /dev/null +++ b/qai_hub_models/models/_shared/ffnet_quantized/aimet_config.json @@ -0,0 +1,68 @@ +{ + "defaults": + { + "ops": + { + "is_output_quantized": "True" + }, + "params": + { 
+ "is_quantized": "True", + "is_symmetric": "True" + }, + "strict_symmetric": "False", + "unsigned_symmetric": "True", + "per_channel_quantization": "True" + }, + + "params": + { + "bias": + { + "is_quantized": "False" + } + }, + + "op_type": + { + "Squeeze": + { + "is_output_quantized": "False" + }, + "Pad": + { + "is_output_quantized": "False" + }, + "Mean": + { + "is_output_quantized": "False" + } + }, + + "supergroups": + [ + { + "op_list": ["Conv", "Relu"] + }, + { + "op_list": ["Conv", "Clip"] + }, + { + "op_list": ["Conv", "BatchNormalization", "Relu"] + }, + { + "op_list": ["Add", "Relu"] + }, + { + "op_list": ["Gemm", "Relu"] + } + ], + + "model_input": + { + "is_input_quantized": "True" + }, + + "model_output": + {} +} diff --git a/qai_hub_models/models/_shared/ffnet_quantized/model.py b/qai_hub_models/models/_shared/ffnet_quantized/model.py new file mode 100644 index 00000000..95a35504 --- /dev/null +++ b/qai_hub_models/models/_shared/ffnet_quantized/model.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import os + +import torch +from aimet_torch.batch_norm_fold import fold_all_batch_norms +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim +from qai_hub.client import DatasetEntries + +from qai_hub_models.models._shared.ffnet.model import FFNet +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.input_spec import InputSpec +from qai_hub_models.utils.quantization_aimet import AIMETQuantizableMixin + +MODEL_ID = __name__.split(".")[-2] +FFNET_AIMET_CONFIG = os.path.abspath( + os.path.join(os.path.dirname(__file__), "aimet_config.json") +) + + +class FFNetQuantizable(AIMETQuantizableMixin, FFNet): + """ + FFNet with post train quantization support. + + Supports only 8-bit weights and activations. 
+ """ + + def __init__( + self, + ffnet_model: FFNet, + ) -> None: + FFNet.__init__(self, ffnet_model.model) + AIMETQuantizableMixin.__init__(self, ffnet_model) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" + + @classmethod + def default_aimet_encodings(cls) -> str: + raise NotImplementedError() + + @classmethod + def from_pretrained( + cls, + variant_name: str, + aimet_encodings: str | None = "DEFAULT", + ) -> "FFNetQuantizable": + ffnet = FFNet.from_pretrained(variant_name).model + + input_shape = FFNetQuantizable.get_input_spec()["image"][0] + + fold_all_batch_norms(ffnet, [input_shape]) + + ffnet = prepare_model(ffnet) + + sim = QuantizationSimModel( + ffnet, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=FFNET_AIMET_CONFIG, + dummy_input=torch.rand(input_shape), + ) + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = cls.default_aimet_encodings() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) + + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + + def get_calibration_data( + self, target_runtime: TargetRuntime, input_spec: InputSpec | None = None + ) -> DatasetEntries | None: + # Do not provide calibration data + return None diff --git a/qai_hub_models/models/_shared/imagenet_classifier/__init__.py b/qai_hub_models/models/_shared/imagenet_classifier/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/models/_shared/imagenet_classifier/app.py b/qai_hub_models/models/_shared/imagenet_classifier/app.py new file mode 100644 index 00000000..2bcb55dd --- /dev/null +++ 
b/qai_hub_models/models/_shared/imagenet_classifier/app.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import torch +from PIL.Image import Image +from torchvision import transforms + +from qai_hub_models.models._shared.imagenet_classifier.model import ( + IMAGENET_DIM, + ImagenetClassifier, +) +from qai_hub_models.utils.image_processing import normalize_image_transform + +IMAGENET_TRANSFORM = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(IMAGENET_DIM), + transforms.ToTensor(), + normalize_image_transform(), + ] +) + + +def preprocess_image(image: Image) -> torch.Tensor: + """ + Preprocesses images to be run through torch imagenet classifiers + as prescribed here: + https://pytorch.org/hub/pytorch_vision_resnet/ + Parameters: + image: Input image to be run through the classifier model. + Returns: + torch tensor to be directly passed to the model. + """ + out_tensor: torch.Tensor = IMAGENET_TRANSFORM(image) # type: ignore + return out_tensor.unsqueeze(0) + + +class ImagenetClassifierApp: + """ + This class consists of light-weight "app code" that is required to + perform end to end inference with an ImagenetClassifier. + + For a given image input, the app will: + * Pre-process the image (resize and normalize) + * Run Imagnet Classification + * Convert the raw output into probabilities using softmax + """ + + def __init__(self, model: ImagenetClassifier): + self.model = model + + def predict(self, image: Image) -> torch.Tensor: + """ + From the provided image or tensor, predict probability distribution + over the 1k Imagenet classes. + + Parameters: + image: A PIL Image in RGB format. + + Returns: + A (1000,) size torch tensor of probabilities, each one corresponding + to a different Imagenet1K class. 
+ """ + + input_tensor = preprocess_image(image) + with torch.no_grad(): + output = self.model(input_tensor) + return torch.softmax(output[0], dim=0) diff --git a/qai_hub_models/models/_shared/imagenet_classifier/demo.py b/qai_hub_models/models/_shared/imagenet_classifier/demo.py new file mode 100644 index 00000000..59aab954 --- /dev/null +++ b/qai_hub_models/models/_shared/imagenet_classifier/demo.py @@ -0,0 +1,61 @@ +from typing import Type + +import torch + +from qai_hub_models.models._shared.imagenet_classifier.app import ImagenetClassifierApp +from qai_hub_models.models._shared.imagenet_classifier.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ImagenetClassifier, +) +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + TEST_IMAGENET_IMAGE, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_json, +) + +IMAGENET_LABELS_ASSET = CachedWebModelAsset( + "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json", + MODEL_ID, + MODEL_ASSET_VERSION, + "imagenet_labels.json", +) + + +# Run Imagenet Classifier end-to-end on a sample image. +# The demo will print the predicted class to terminal. 
+def imagenet_demo(model_cls: Type[ImagenetClassifier], is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(model_cls) + parser = get_on_device_demo_parser(parser) + parser.add_argument( + "--image", + type=str, + default=TEST_IMAGENET_IMAGE, + help="test image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, model_cls.get_model_id()) + + model = demo_model_from_cli_args(model_cls, args) + app = ImagenetClassifierApp(model) + print("Model Loaded") + + image = load_image(args.image) + # Run app + probabilities = app.predict(image) + top5 = torch.topk(probabilities, 5) + if not is_test: + labels = load_json(IMAGENET_LABELS_ASSET) + print("Top 5 predictions for image:\n") + for i in range(5): + print(f"{labels[top5.indices[i]]}: {100 * top5.values[i]:.3g}%\n") diff --git a/qai_hub_models/models/_shared/imagenet_classifier/model.py b/qai_hub_models/models/_shared/imagenet_classifier/model.py new file mode 100644 index 00000000..4f743bfe --- /dev/null +++ b/qai_hub_models/models/_shared/imagenet_classifier/model.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from typing import Optional + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.classification_evaluator import ClassificationEvaluator +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ASSET_VERSION = 1 +MODEL_ID = __name__.split(".")[-2] +IMAGENET_DIM = 224 + + +class ImagenetClassifier(BaseModel): + """ + Base class for all Imagenet Classifier models within QAI Hub Models. + """ + + def __init__( + self, + net: torch.nn.Module, + ): + """ + Basic initializer which takes in a pretrained classifier network. + Subclasses can choose to implement their own __init__ and forward methods. 
+ """ + super().__init__() + self.net = net + self.eval() + + def forward(self, image_tensor: torch.Tensor): + """ + Predict class probabilities for an input `image`. + + Parameters: + image: A [1, 3, 224, 224] image. + Assumes image has been resized and normalized using the + standard preprocessing method for PyTorch Imagenet models. + + Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + A [1, 1000] where each value is the log-likelihood of + the image belonging to the corresponding Imagenet class. + """ + return self.net(image_tensor) + + def get_evaluator(self) -> BaseEvaluator: + return ClassificationEvaluator() + + def get_input_spec( + self, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm® AI Hub. + """ + return {"image_tensor": ((1, 3, IMAGENET_DIM, IMAGENET_DIM), "float32")} + + @classmethod + def from_pretrained( + cls, + weights: Optional[str] = None, + ) -> "ImagenetClassifier": + net = cls.model_builder(weights=weights or cls.DEFAULT_WEIGHTS) + return cls(net) diff --git a/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py b/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py new file mode 100644 index 00000000..a276ae6f --- /dev/null +++ b/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py @@ -0,0 +1,102 @@ +import pytest +import torch + +from qai_hub_models.models._shared.imagenet_classifier.app import ( + ImagenetClassifierApp, + preprocess_image, +) +from qai_hub_models.models._shared.imagenet_classifier.model import ( + MODEL_ASSET_VERSION, + ImagenetClassifier, +) +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.testing import assert_most_close + +GROUP_NAME = "imagenet_classifier" +TEST_IMAGENET_IMAGE = CachedWebModelAsset.from_asset_store( + GROUP_NAME, MODEL_ASSET_VERSION, 
"dog.jpg" +) + +# Class "Samoyed" from https://gist.github.com/ageitgey/4e1342c10a71981d0b491e1b8227328b +TEST_IMAGENET_CLASS = 258 + + +@pytest.fixture(scope="module") +def imagenet_sample_torch() -> torch.Tensor: + """ + Returns: + + - Preprocessed (normalized etc) image as torch.Tensor with shape [1, 3, 224, 224] + """ + img = load_image(TEST_IMAGENET_IMAGE, "imagenet_classifier") + return preprocess_image(img) + + +def run_imagenet_classifier_test( + model: ImagenetClassifier, + model_name: str, + asset_version: int = 2, + probability_threshold: float = 0.7, + diff_tol: float = 0.0, + rtol: float = 0.0, + atol: float = 1e-4, +) -> None: + """ + Evaluates the classifier on a test image and validates the output. + + Parameters: + model: The model to evaluate. + model_name: Identifier used to lookup the expected output file. + asset_version: Version of the expected output file to lookup. + probability_threshold: If the predicited probability for the correct class + is below this threshold, the method throws an error. + diff_tol: Float in range [0,1] representing the maximum percentage of + the probabilities that can differ from the ground truth while + still having the test pass. + atol: Absolute tolerance allowed for two numbers to be "close". + rtol: Relative tolerance allowed for two numbers to be "close". + """ + + img = load_image(TEST_IMAGENET_IMAGE) + app = ImagenetClassifierApp(model) + probabilities = app.predict(img) + + expected_output = CachedWebModelAsset.from_asset_store( + model_name, asset_version, "expected_out.npy" + ) + expected_out = load_numpy(expected_output) + assert_most_close(probabilities.numpy(), expected_out, diff_tol, rtol, atol) + + predicted_class = torch.argmax(probabilities, dim=0) + predicted_probability = probabilities[TEST_IMAGENET_CLASS].item() + assert ( + predicted_probability > probability_threshold + ), f"Predicted probability {predicted_probability:.3f} is below the threshold {probability_threshold}." 
def run_imagenet_classifier_trace_test(
    model: ImagenetClassifier,
    diff_tol: float = 0.005,
    rtol: float = 0.0,
    atol: float = 1e-4,
    is_quantized: bool = False,
    check_trace: bool = True,
) -> None:
    """
    Check that the TorchScript export of `model` matches the eager model.

    The shared test image is run through both the eager model and its traced
    counterpart, and the two probability vectors are compared.

    Parameters:
        model: The model to evaluate.
        diff_tol: Fraction of probabilities allowed to differ while still passing.
        rtol: Relative tolerance for two numbers to count as "close".
        atol: Absolute tolerance for two numbers to count as "close".
        is_quantized: If set, use `convert_to_quantized_torchscript` for the export.
        check_trace: Forwarded to `convert_to_torchscript` (non-quantized path only).
    """
    sample_image = load_image(TEST_IMAGENET_IMAGE)
    eager_app = ImagenetClassifierApp(model)

    # Pick the export path, then wrap the traced module in the same app.
    if is_quantized:
        traced_model = model.convert_to_quantized_torchscript()
    else:
        traced_model = model.convert_to_torchscript(check_trace=check_trace)
    traced_app = ImagenetClassifierApp(traced_model)

    eager_probs = eager_app.predict(sample_image)
    traced_probs = traced_app.predict(sample_image)
    assert_most_close(eager_probs.numpy(), traced_probs.numpy(), diff_tol, rtol, atol)
    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
        detector_anchors: torch.Tensor,
        landmark_detector: Callable[[torch.Tensor], Tuple[torch.Tensor, ...]],
        detector_input_dims: Tuple[int, int],
        landmark_input_dims: Tuple[int, int],
        keypoint_rotation_vec_start_idx: int,
        keypoint_rotation_vec_end_idx: int,
        rotation_offset_rads: float,
        detect_box_offset_xy: float,
        detect_box_scale: float,
        min_detector_box_score: float = 0.95,
        detector_score_clipping_threshold: int = 100,
        nms_iou_threshold: float = 0.3,
        min_landmark_score: float = 0.5,
        landmark_connections: List[Tuple[int, int]] | None = None,
    ):
        """
        Create a MediaPipe application.

        The constructor only stores its arguments; no model is run here.

        Parameters:
            detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]
                The bounding box and keypoint detector model.
                Input is an image [N C H W], channel layout is BGR, output is [coordinates, scores].

            detector_anchors: torch.Tensor
                Detector anchors, for decoding predictions from anchor points to boxes.

            landmark_detector: Callable[[torch.Tensor], Tuple[torch.Tensor, ...]]
                The landmark detector model. Input is an image [N C H W],
                channel layout is BGR, output is [scores, landmarks].

            detector_input_dims: Tuple[int, int]
                Input dimensionality (W, H) of the bounding box detector.

            landmark_input_dims: Tuple[int, int]
                Input dimensionality (W, H) of the landmark detector.

            keypoint_rotation_vec_start_idx: int
                The index of a keypoint (predicted by the bounding box detector). This KP is the start
                of the vector used to compute the angle at which the object should be rotated (before
                being passed to the landmark detector).

            keypoint_rotation_vec_end_idx: int
                The index of a keypoint (predicted by the bounding box detector). This KP is the end
                of the vector used to compute the angle at which the object should be rotated (before
                being passed to the landmark detector).

            rotation_offset_rads: float
                Passed to `compute_vector_rotation` together with the two keypoints above when
                computing the ROI rotation.
                NOTE(review): presumably a constant angular offset, in radians, added to the
                computed rotation -- confirm against `compute_vector_rotation`.

            detect_box_offset_xy: float
                Move the detected bounding box in the direction of the rotation vector described above
                before passing the box to the landmark detector. The value is multiplied by the
                box width to obtain the offset.

            detect_box_scale: float
                Scale the detected bounding box's size by this amount
                before passing the box to the landmark detector.

            min_detector_box_score: float
                Minimum detector box score for a box to be used for landmark detection.

            detector_score_clipping_threshold: int
                Clip detector box scores to [-threshold, threshold]

            nms_iou_threshold: float
                IOU threshold for when NMS is run on the detector output boxes.

            min_landmark_score: float
                Any landmark set with a score below this number will be discarded.
                NOTE(review): `_run_landmark_detector` in this class compares landmark scores
                against `min_detector_box_score`; confirm which threshold is intended.

            landmark_connections: List[Tuple[int, int]] | None
                Connections between landmark output points.
                Format is List[Tuple[Landmark Point Index 0, Landmark Point Index 1]]
                These connections will be drawn on the output image when applicable.
        """
        # Models and decoding data.
        self.detector = detector
        self.detector_anchors = detector_anchors
        self.landmark_detector = landmark_detector
        self.detector_input_dims = detector_input_dims
        self.landmark_input_dims = landmark_input_dims
        # ROI geometry configuration.
        self.keypoint_rotation_vec_start_idx = keypoint_rotation_vec_start_idx
        self.keypoint_rotation_vec_end_idx = keypoint_rotation_vec_end_idx
        self.rotation_offset_rads = rotation_offset_rads
        self.detect_box_offset_xy = detect_box_offset_xy
        self.detect_box_scale = detect_box_scale
        # Score / NMS thresholds.
        self.detector_score_clipping_threshold = detector_score_clipping_threshold
        self.min_detector_box_score = min_detector_box_score
        self.nms_iou_threshold = nms_iou_threshold
        self.min_landmark_score = min_landmark_score
        # Optional drawing configuration.
        self.landmark_connections = landmark_connections
+ + Otherwise, returns several "batched" (one element per input image) lists: + batched_selected_boxes: List[torch.Tensor | None] + Selected object bounding box coordinates. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 2, 2]. + Layout is + [[box_x1, box_y1], + [box_x2, box_y2]] + + batched_selected_keypoints: List[torch.Tensor | None] + Selected object bounding box keypoints. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, # of keypoints, 2]. + Layout is + [[keypoint_0_x, keypoint_0_y], + ..., + [keypoint_max_x, keypoint_max_y]] + + batched_roi_4corners: List[torch.Tensor | None] + Selected object "region of interest" (region used as input to the landmark detector) corner coordinates. + None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 4, 2], where 2 == (x, y) + The order of points is (top left point, bottom left point, top right point, bottom right point) + + batched_selected_landmarks: List[torch.tensor | None] + Selected landmarks. Organized like the following: + [ + # Batch 0 (for Input Image 0) + torch.Tensor([ + Selected Landmark 1 w/ shape (# of landmark points, 3) + Selected Landmark 2 w/ shape (# of landmark points, 3) + ... + ]), + # Batch 1 (for Input Image 1) + None # (this image has no detected object) + ... + ] + The shape of each inner list element is [# of landmark points, 3], + where 3 == (X, Y, Conf) + + ... (additional outputs if necessary) + """ + # Input Prep + NHWC_int_numpy_frames, NCHW_fp32_torch_frames = app_to_net_image_inputs( + pixel_values_or_image + ) + + # Run Bounding Box & Keypoint Detector + batched_selected_boxes, batched_selected_keypoints = self._run_box_detector( + NCHW_fp32_torch_frames + ) + + # The region of interest ( bounding box of 4 (x, y) corners). 
+ # List[torch.Tensor(shape=[Num Boxes, 4, 2])], + # where 2 == (x, y) + # + # A list element will be None if there is no selected ROI. + batched_roi_4corners = self._compute_object_roi( + batched_selected_boxes, batched_selected_keypoints + ) + + # selected landmarks for the ROI (if any) + # List[torch.Tensor(shape=[Num Selected Landmarks, K, 3])], + # where K == number of landmark keypoints, 3 == (x, y, p) + # + # A list element will be None if there is no ROI. + landmarks_out = self._run_landmark_detector( + NHWC_int_numpy_frames, batched_roi_4corners + ) + + if raw_output: + return ( + batched_selected_boxes, + batched_selected_keypoints, + batched_roi_4corners, + *landmarks_out, + ) + + self._draw_predictions( + NHWC_int_numpy_frames, + batched_selected_boxes, + batched_selected_keypoints, + batched_roi_4corners, + *landmarks_out, + ) + + return NHWC_int_numpy_frames + + def _run_box_detector( + self, NCHW_fp32_torch_frames: torch.Tensor + ) -> Tuple[List[torch.Tensor | None], List[torch.Tensor | None]]: + """ + From the provided image or tensor, predict the bounding boxes and keypoints of objects detected within. + + Parameters: + NCHW_fp32_torch_frames: torch.Tensor + pyTorch tensor (N C H W x fp32, value range is [0, 1]), BGR channel layout + + Returns: + batched_selected_boxes: List[torch.Tensor | None] + Selected object bounding box coordinates. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 2, 2]. + Layout is + [[box_x1, box_y1], + [box_x2, box_y2]] + + batched_selected_keypoints: List[torch.Tensor | None] + Selected object bounding box keypoints. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, # of keypoints, 2]. + Layout is + [[keypoint_0_x, keypoint_0_y], + ..., + [keypoint_max_x, keypoint_max_y]] + """ + + # Resize input frames such that they're the appropriate size for detector inference. 
+ box_detector_net_inputs, pd_net_input_scale, pd_net_input_pad = resize_pad( + NCHW_fp32_torch_frames, self.detector_input_dims + ) + + # Run object detector. + # Outputs: + # - box_coords: , where N == # of anchors & C == # of of coordinates + # Layout of C is (box_cx, boc_cw, box_w, box_h, keypoint_0_x, keypoint_0_y, ..., keypoint_maxKey_x, keypoint_maxKey_y) + # - box_scores: , where N == # of anchors. + box_coords, box_scores = self.detector(box_detector_net_inputs) + box_scores = box_scores.clamp( + -self.detector_score_clipping_threshold, + self.detector_score_clipping_threshold, + ) + box_scores = box_scores.sigmoid().squeeze(dim=-1) + + # Reshape outputs so that they have shape [..., # of coordinates, 2], where 2 == (x, y) + box_coords = box_coords.view(list(box_coords.shape)[:-1] + [-1, 2]) + anchors = self.detector_anchors.view( + list(self.detector_anchors.shape)[:-1] + [-1, 2] + ) + + # Decode to output coordinates using the model's trained anchors. + decode_preds_from_anchors(box_coords, self.detector_input_dims, anchors) + + # Convert box coordinates from CWH -> XYXY format for NMS. 
+ box_coords[:2] = box_xywh_to_xyxy(box_coords[:2]) + + # flatten coords (remove final [2] dim) for NMS + flattened_box_coords = box_coords.view(list(box_coords.shape)[:-2] + [-1]) + + # Run non maximum suppression on the output + # batched_selected_coords = List[torch.Tensor(shape=[Num Boxes, 4])], + # where 4 = (x0, y0, x1, y1) + batched_selected_coords, _ = batched_nms( + self.nms_iou_threshold, + self.min_detector_box_score, + flattened_box_coords, + box_scores, + ) + + selected_boxes = [] + selected_keypoints = [] + for i in range(0, len(batched_selected_coords)): + selected_coords = batched_selected_coords[i] + if len(selected_coords) != 0: + # Reshape outputs again so that they have shape [..., # of boxes, 2], where 2 == (x, y) + selected_coords = batched_selected_coords[i].view( + list(batched_selected_coords[i].shape)[:-1] + [-1, 2] + ) + + denormalize_coordinates( + selected_coords, + self.detector_input_dims, + pd_net_input_scale, + pd_net_input_pad, + ) + + selected_boxes.append(selected_coords[:, :2]) + selected_keypoints.append(selected_coords[:, 2:]) + else: + selected_boxes.append(None) + selected_keypoints.append(None) + + return selected_boxes, selected_keypoints + + def _compute_object_roi( + self, + batched_selected_boxes: List[torch.Tensor | None], + batched_selected_keypoints: List[torch.Tensor | None], + ) -> List[torch.Tensor | None]: + """ + From the provided bounding boxes and keypoints, compute the region of interest (ROI) that should be used + as input to the landmark detection model. + + Parameters: + batched_selected_boxes: List[torch.Tensor | None] + Selected object bounding box coordinates. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 2, 2]. + Layout is + [[box_x1, box_y1], + [box_x2, box_y2]] + + batched_selected_keypoints: List[torch.Tensor | None] + Selected object bounding box keypoints. 
None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, # of keypoints, 2]. + Layout is + [[keypoint_0_x, keypoint_0_y], + ..., + [keypoint_max_x, keypoint_max_y]] + + Returns + batched_roi_4corners: List[torch.Tensor | None] + Selected object "region of interest" (region used as input to the landmark detector) corner coordinates. + None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 4, 2], where 2 == (x, y) + The order of points is (top left point, bottom left point, top right point, bottom right point) + """ + batched_selected_roi = [] + for boxes, keypoints in zip(batched_selected_boxes, batched_selected_keypoints): + if boxes is None or keypoints is None: + batched_selected_roi.append(None) + continue + + # Compute bounding box center and rotation + theta = compute_vector_rotation( + keypoints[:, self.keypoint_rotation_vec_start_idx, ...], + keypoints[:, self.keypoint_rotation_vec_end_idx, ...], + self.rotation_offset_rads, + ) + selected_boxes_cwh = box_xyxy_to_xywh(boxes) + xc = selected_boxes_cwh[..., 0, 0] + yc = selected_boxes_cwh[..., 0, 1] + w = selected_boxes_cwh[..., 1, 0] + h = selected_boxes_cwh[..., 1, 1] + + # The bounding box often misses the entire object. + # Move the bounding box slightly (if necessary) to center it with the object. 
+ apply_directional_box_offset( + self.detect_box_offset_xy * w, + keypoints[..., self.keypoint_rotation_vec_start_idx, :], + keypoints[..., self.keypoint_rotation_vec_end_idx, :], + xc, + yc, + ) + + # Apply scaling to enlargen the bounding box + w *= self.detect_box_scale + h *= self.detect_box_scale + + # Compute box corners from box center, width, height + batched_selected_roi.append( + compute_box_corners_with_rotation(xc, yc, w, h, theta) + ) + + return batched_selected_roi + + def _run_landmark_detector( + self, + NHWC_int_numpy_frames: List[np.ndarray], + batched_roi_4corners: List[torch.Tensor | None], + ) -> Tuple[List[torch.Tensor | None]]: + """ + From the provided image or tensor, predict the bounding boxes & classes of objects detected within. + + Parameters: + NHWC_int_numpy_frames: + List of numpy arrays of shape (H W C x uint8) -- BGR channel layout + Length of list is # of batches (the number of input images) + + batched_roi_4corners: List[torch.Tensor | None] + Selected object "region of interest" (region used as input to the landmark detector) corner coordinates. + None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 4, 2], where 2 == (x, y) + The order of points is (top left point, bottom left point, top right point, bottom right point) + + Returns: + batched_selected_landmarks: List[torch.tensor | None] + Selected landmarks. Organized like the following: + [ + # Batch 0 (for Input Image 0) + torch.Tensor([ + Selected Landmark 1 w/ shape (# of landmark points, 3) + Selected Landmark 2 w/ shape (# of landmark points, 3) + ... + ]), + # Batch 1 (for Input Image 1) + None # (this image has no detected object) + ... + ] + The shape of each inner list element is [# of landmark points, 3], + where 3 == (X, Y, Conf) + + ... 
(additional outputs when needed by implementation) + """ + + # selected landmarks for the ROI (if any) + # List[torch.Tensor(shape=[Num Selected Landmarks, K, 3])], + # where K == number of landmark keypoints, 3 == (x, y, p) + # + # A list element will be None if there is no ROI. + batched_selected_landmarks: List[torch.Tensor | None] = [] + + # For each input image... + for batch_idx, roi_4corners in enumerate(batched_roi_4corners): + if roi_4corners is None: + continue + affines = compute_box_affine_crop_resize_matrix( + roi_4corners[:, :3], self.landmark_input_dims + ) + + # Create input images by applying the affine transforms. + keypoint_net_inputs = numpy_image_to_torch( + apply_batched_affines_to_frame( + NHWC_int_numpy_frames[batch_idx], affines, self.landmark_input_dims + ) + ) + + # Compute landmarks. + ld_scores, landmarks = self.landmark_detector( # type: ignore + keypoint_net_inputs + ) + + # Convert [0-1] ranged values of landmarks to integer pixel space. + landmarks[:, :, 0] *= self.landmark_input_dims[0] + landmarks[:, :, 1] *= self.landmark_input_dims[1] + + # 1 landmark is predicted for each ROI of each input image. + # For each region of interest & associated predicted landmarks... + all_landmarks = [] + for ld_batch_idx in range(landmarks.shape[0]): + # Exclude landmarks that don't meet the appropriate score threshold. + if ld_scores[ld_batch_idx] >= self.min_detector_box_score: + # Apply the inverse of affine transform used above to the landmark coordinates. + # This will convert the coordinates to their locations in the original input image. + inverted_affine = torch.from_numpy( + cv2.invertAffineTransform(affines[ld_batch_idx]) + ).float() + landmarks[ld_batch_idx][:, :2] = apply_affine_to_coordinates( + landmarks[ld_batch_idx][:, :2], inverted_affine + ) + + # Add the predicted landmarks to our list. + all_landmarks.append(landmarks[ld_batch_idx]) + + # Add this batch of landmarks to the output list. 
+ batched_selected_landmarks.append( + torch.stack(all_landmarks, dim=0) if all_landmarks else None + ) + else: + # Add None for these lists, since this batch has no predicted bounding boxes. + batched_selected_landmarks.append(None) + + return (batched_selected_landmarks,) + + def _draw_box_and_roi( + self, + NHWC_int_numpy_frame: np.ndarray, + selected_boxes: torch.Tensor, + selected_keypoints: torch.Tensor, + roi_4corners: torch.Tensor, + ): + """ + Draw bounding box, keypoints, and corresponding region of interest (ROI) on the provided frame + + Parameters: + NHWC_int_numpy_frame: + Numpy array of shape (H W C x uint8) -- BGR channel layout + + selected_boxes: torch.Tensor + Selected object bounding box coordinates. Shape is [num_selected_boxes, 2, 2]. + Layout is + [[box_x1, box_y1], + [box_x2, box_y2]] + + selected_keypoints: List[torch.Tensor | None] + Selected object bounding box keypoints. Shape is [num_selected_boxes, # of keypoints, 2]. + Layout is + [[keypoint_0_x, keypoint_0_y], + ..., + [keypoint_max_x, keypoint_max_y]] + + roi_4corners: List[torch.Tensor | None] + Selected object "region of interest" (region used as input to the landmark detector) corner coordinates. + Shape is [num_selected_boxes, 4, 2], where 2 == (x, y) + + Returns + Nothing; drawing is done on input frame. 
+ """ + for roi, box, kp in zip(roi_4corners, selected_boxes, selected_keypoints): + # Draw detector bounding box + draw_box_from_xyxy(NHWC_int_numpy_frame, box[0], box[1], (255, 0, 0), 1) + # Draw detector keypoints + draw_points(NHWC_int_numpy_frame, kp) + # Draw region of interest box computed from the detector box & keypoints + # (this is the input to the landmark detector) + draw_box_from_corners(NHWC_int_numpy_frame, roi, (0, 255, 0)) + + def _draw_landmarks( + self, + NHWC_int_numpy_frame: np.ndarray, + selected_landmarks: torch.Tensor, + **kwargs, + ): + """ + Draw landmarks on the provided frame + + Parameters: + NHWC_int_numpy_frame: + Numpy array of shape (H W C x uint8) -- BGR channel layout + + selected_landmarks + Selected landmarks. Organized like the following: + torch.Tensor([ + Selected Landmark 1 w/ shape (# of landmark points, 3) + Selected Landmark 2 w/ shape (# of landmark points, 3) + ... + ]), + The shape of each inner list element is [# of landmark points, 3], + where 3 == (X, Y, Conf) + + Returns + Nothing; drawing is done on input frame. 
+ """ + for ldm in selected_landmarks: + # Draw landmark points + draw_points(NHWC_int_numpy_frame, ldm[:, :2], (0, 255, 0)) + # Draw connections between landmark points + if self.landmark_connections: + draw_connections( + NHWC_int_numpy_frame, + ldm[:, :2], + self.landmark_connections, + (255, 0, 0), + 2, + ) + + def _draw_predictions( + self, + NHWC_int_numpy_frames: List[np.ndarray], + batched_selected_boxes: List[torch.Tensor | None], + batched_selected_keypoints: List[torch.Tensor | None], + batched_roi_4corners: List[torch.Tensor | None], + batched_selected_landmarks: List[torch.Tensor | None], + **kwargs, + ): + """ + Draw predictions on the provided frame + + Parameters: + NHWC_int_numpy_frames: + List of numpy arrays of shape (H W C x uint8) -- BGR channel layout + Length of list is # of batches (the number of input images) + + batched_selected_boxes: List[torch.Tensor | None] + Selected object bounding box coordinates. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 2, 2]. + Layout is + [[box_x1, box_y1], + [box_x2, box_y2]] + + batched_selected_keypoints: List[torch.Tensor | None] + Selected object bounding box keypoints. None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, # of keypoints, 2]. + Layout is + [[keypoint_0_x, keypoint_0_y], + ..., + [keypoint_max_x, keypoint_max_y]] + + batched_roi_4corners: List[torch.Tensor | None] + Selected object "region of interest" (region used as input to the landmark detector) corner coordinates. + None if batch had no bounding boxes with a score above the threshold. + Shape of each list element is [num_selected_boxes, 4, 2], where 2 == (x, y) + The order of points is (top left point, bottom left point, top right point, bottom right point) + + batched_selected_landmarks: List[torch.tensor | None] + Selected landmarks. 
Organized like the following: + [ + # Batch 0 (for Input Image 0) + torch.Tensor([ + Selected Landmark 1 w/ shape (# of landmark points, 3) + Selected Landmark 2 w/ shape (# of landmark points, 3) + ... + ]), + # Batch 1 (for Input Image 1) + None # (this image has no detected object) + ... + ] + The shape of each inner list element is [# of landmark points, 3], + where 3 == (X, Y, Conf) + + Returns + Nothing; drawing is done on input frame + """ + for batch_idx in range(len(NHWC_int_numpy_frames)): + image = NHWC_int_numpy_frames[batch_idx] + ld = batched_selected_landmarks[batch_idx] + box = batched_selected_boxes[batch_idx] + kp = batched_selected_keypoints[batch_idx] + roi_4corners = batched_roi_4corners[batch_idx] + + if box is not None and kp is not None and roi_4corners is not None: + self._draw_box_and_roi(image, box, kp, roi_4corners) + if ld is not None: + self._draw_landmarks(image, ld) diff --git a/qai_hub_models/models/_shared/mediapipe/utils.py b/qai_hub_models/models/_shared/mediapipe/utils.py new file mode 100644 index 00000000..21e6c6bf --- /dev/null +++ b/qai_hub_models/models/_shared/mediapipe/utils.py @@ -0,0 +1,108 @@ +from functools import partial +from typing import Any, Tuple + +import torch + +from qai_hub_models.utils.asset_loaders import SourceAsRoot +from qai_hub_models.utils.input_spec import InputSpec + +# ContextManager for running code with MediaPipePyTorch in python path and the +# root directory of MediaPipePyTorch set as cwd +MediaPipePyTorchAsRoot = partial( + SourceAsRoot, + "https://github.com/zmurez/MediaPipePyTorch", + "65f2549ba35cd61dfd29f402f6c21882a32fabb1", + "mediapipe_pytorch", + 1, +) + + +def trace_mediapipe( + detector_input_spec: InputSpec, + box_detector: torch.nn.Module, + landmark_input_spec: InputSpec, + landmark_detector: torch.nn.Module, +) -> Tuple[Any, Any]: + # Convert the models to pytorch traces. Traces can be saved & loaded from disk. 
def trace_mediapipe(
    detector_input_spec: InputSpec,
    box_detector: torch.nn.Module,
    landmark_input_spec: InputSpec,
    landmark_detector: torch.nn.Module,
) -> Tuple[Any, Any]:
    """
    Convert both MediaPipe models to pytorch traces.

    Each model is traced with a random tensor whose shape is taken from the
    "image" entry of its input spec. Traces can be saved & loaded from disk.
    With Qualcomm® AI Hub, a pytorch trace can be exported to run efficiently
    on mobile devices!

    Returns:
        Tuple[Box Detector Trace Object, Landmark Detector Trace Object]
    """

    def _trace(module: torch.nn.Module, input_spec) -> Any:
        # The first element of the spec entry is the input shape.
        sample_shape = input_spec["image"][0]
        return torch.jit.trace(module, [torch.rand(sample_shape)])

    # Box detector first, then landmark detector (same order as the result tuple).
    box_detector_trace = _trace(box_detector, detector_input_spec)
    landmark_detector_trace = _trace(landmark_detector, landmark_input_spec)
    return box_detector_trace, landmark_detector_trace
def decode_preds_from_anchors(
    box_coords: torch.Tensor, img_size: Tuple[int, int], anchors: torch.Tensor
):
    """
    Decode anchor-relative predictions, in place, using the provided anchors.

    This function can be exported and run inside inference frameworks if desired.

    Note: If included in the model, this code is likely to be unfriendly to quantization.
          This is because of the high range and variability of the output tensor.

          For best quantization accuracy, this code should be run separately from the model,
          or the model should de-quantize activations before running these layers.

    Inputs:
        box_coords: torch.Tensor
            Raw predictions. Shape is [..., Num Anchors, K, 2], where row 0 is
            [xcenter, ycenter], row 1 is [w, h], and any further rows are
            keypoint [x, y] pairs.

        img_size: Tuple(int, int)
            (w, h) size of the tensor that was fed to the NETWORK (NOT the original image size).

        anchors: torch.Tensor
            Box anchors. Shape is [..., Num Anchors, 2, 2],
            where [2, 2] == [[xcenter, ycenter], [w, h]]

    Outputs:
        Nothing is returned; `box_coords` is modified in place. Every value is
        divided by the matching image dimension and multiplied by the anchor
        size; centers and keypoints are additionally offset by the anchor center.
    """
    assert box_coords.shape[-1] == anchors.shape[-1] == 2
    assert box_coords.shape[-3] == anchors.shape[-3]

    # Handle the x axis (index 0) and the y axis (index 1) with the same formula.
    for axis, net_size in enumerate(img_size):
        anchor_center = anchors[..., 0, axis]
        anchor_extent = anchors[..., 1, axis]

        # Box center: scaled and shifted by the anchor.
        box_coords[..., 0, axis] = (
            box_coords[..., 0, axis] / net_size * anchor_extent + anchor_center
        )
        # Box size: scaled only.
        box_coords[..., 1, axis] = box_coords[..., 1, axis] / net_size * anchor_extent
        # Keypoints: same as the center, broadcast over the keypoint axis.
        box_coords[..., 2:, axis] = box_coords[
            ..., 2:, axis
        ] / net_size * anchor_extent.unsqueeze(-1) + anchor_center.unsqueeze(-1)
def _load_quicksrnet_source_model(
    model_id,
    model_asset_version,
    scaling_factor,
    num_channels,
    num_intermediate_layers,
    use_ito_connection,
) -> torch.nn.Module:
    """
    Build a QuickSRNetBase model from the source repository.

    The source repository is made available through `SourceAsRoot`, its
    `aimet_zoo_torch/quicksrnet/__init__.py` is patched on disk, and the model
    class is then imported and instantiated with the given hyperparameters.

    Returns:
        An instance of `aimet_zoo_torch.quicksrnet.model.models.QuickSRNetBase`.
    """
    with SourceAsRoot(
        QUICKSRNET_SOURCE_REPOSITORY,
        QUICKSRNET_SOURCE_REPO_COMMIT,
        model_id,
        model_asset_version,
    ):
        # Remove import of model_definition.py as it has an import error itself,
        # but we don't need anything from that file here
        package_init = "aimet_zoo_torch/quicksrnet/__init__.py"
        with open(package_init, "r") as init_reader:
            original_text = init_reader.read()
        patched_text = original_text.replace(
            "from .model.model_definition import QuickSRNet", " "
        )
        with open(package_init, "w") as init_writer:
            init_writer.write(patched_text)

        # Import only after patching, so the broken import is never executed.
        from aimet_zoo_torch.quicksrnet.model.models import QuickSRNetBase

        model_kwargs = dict(
            scaling_factor=scaling_factor,
            num_channels=num_channels,
            num_intermediate_layers=num_intermediate_layers,
            use_ito_connection=use_ito_connection,
        )
        return QuickSRNetBase(**model_kwargs)
class RepaintMaskApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with AOTGAN.

    The app uses 1 model:
        * AOTGAN

    For a given image input, the app will:
        * pre-process the image
        * Run AOTGAN inference
        * Convert the output tensor into a PIL Image
    """

    def __init__(self, model: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]):
        # Callable taking (masked image batch, mask batch) and returning the repainted batch.
        self.model = model

    def predict(self, *args, **kwargs):
        # See paint_mask_on_image.
        return self.paint_mask_on_image(*args, **kwargs)

    def paint_mask_on_image(
        self,
        pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image],
        mask_pixel_values_or_image: torch.Tensor | np.ndarray | Image,
    ) -> List[Image]:
        """
        Erases and repaints the source image[s] in the pixel values given by the mask.

        Parameters:
            pixel_values_or_image
                PIL image(s)
                or
                numpy array (N H W C x uint8) or (H W C x uint8) -- both RGB channel layout
                or
                pyTorch tensor (N C H W x fp32, value range is [0, 1]), RGB channel layout

            mask_pixel_values_or_image
                PIL image(s)
                or
                numpy array (N H W C x uint8) or (H W C x uint8) -- both RGB channel layout
                or
                pyTorch tensor (N C H W x fp32, value range is [0, 1]), RGB channel layout

                If one mask is provided, it will be used for every input image.

        Returns:
            images: List[PIL.Image]
                A list of predicted images (one list element per batch).
        """
        NCHW_fp32_torch_frames = app_to_net_image_inputs(pixel_values_or_image)[1]
        NCHW_fp32_torch_masks = app_to_net_image_inputs(mask_pixel_values_or_image)[1]

        # The number of input images should equal the number of input masks.
        # A single mask is repeated along the batch axis so that every image gets
        # its own copy; masks that are already batched are left untouched.
        num_frames = NCHW_fp32_torch_frames.shape[0]
        if NCHW_fp32_torch_masks.shape[0] == 1 and num_frames != 1:
            NCHW_fp32_torch_masks = NCHW_fp32_torch_masks.tile((num_frames, 1, 1, 1))

        # Mask input image: masked pixels are set to 1, the rest keep their value.
        image_masked = (
            NCHW_fp32_torch_frames * (1 - NCHW_fp32_torch_masks) + NCHW_fp32_torch_masks
        )
        out = self.model(image_masked, NCHW_fp32_torch_masks)

        return [torch_tensor_to_PIL_image(img) for img in out]
def repaint_demo(
    model_type: Type[BaseModel],
    default_image: str | CachedWebAsset,
    default_mask: str | CachedWebAsset,
    is_test: bool = False,
):
    """
    Run the repaint app end-to-end on a sample image and mask.

    Parameters:
        model_type: Model class used to build the CLI parser and the model.
        default_image: Default for the --image argument (file path or URL).
        default_mask: Default for the --mask argument (file path or URL).
        is_test: When True, CLI arguments are ignored (defaults are used) and
            no image is displayed or saved.
    """
    # Build the command line interface.
    cli = get_on_device_demo_parser(
        get_model_cli_parser(model_type),
        available_target_runtimes=[TargetRuntime.TFLITE],
        add_output_dir=True,
    )
    for flag, fallback, description in (
        ("--image", default_image, "test image file path or URL"),
        ("--mask", default_mask, "test mask file path or URL"),
    ):
        cli.add_argument(flag, type=str, default=fallback, help=description)

    cli_args = cli.parse_args(None if not is_test else [])
    validate_on_device_demo_args(cli_args, model_type.get_model_id())

    # Load the model first, then both input images.
    network = demo_model_from_cli_args(model_type, cli_args)
    source_image = load_image(cli_args.image)
    mask_image = load_image(cli_args.mask)
    print("Model Loaded")

    # Only the first repainted image is used by the demo.
    repainted = RepaintMaskApp(network).paint_mask_on_image(source_image, mask_image)[0]

    if is_test:
        return

    display_or_save_image(
        source_image, cli_args.output_dir, "input_image.png", "input image"
    )
    display_or_save_image(
        repainted, cli_args.output_dir, "output_image.png", "output image"
    )
# Number of pixels of reflect padding added to the right / bottom of the input.
PRE_PAD = 10
# Fallback upscaling factor, used when it cannot be inferred from the model output.
SCALE = 4


class SuperResolutionApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with Super Resolution models.

    The app uses 1 model:
        * SuperResolution models

    For a given image input, the app will:
        * pre-process the image (convert to range[0, 1])
        * Run inference
        * post-process the image (remove the padding added before inference)
        * convert the output to PIL images
    """

    def __init__(self, model: Callable[[torch.Tensor], torch.Tensor]):
        self.model = model

    def predict(self, *args, **kwargs):
        # See upscale_image.
        return self.upscale_image(*args, **kwargs)

    def upscale_image(
        self,
        pixel_values_or_image: torch.Tensor | Image | List[Image],
    ) -> List[Image]:
        """
        Upscale provided images

        Parameters:
            pixel_values_or_image
                PIL image(s)
                or
                numpy array (N H W C x uint8) or (H W C x uint8) -- both RGB channel layout
                or
                pyTorch tensor (N C H W x fp32, value range is [0, 1]), RGB channel layout

        Returns:
                images: List[PIL.Image.Image]
                    A list of upscaled images (one for each input image).
        """
        _, NCHW_fp32_torch_frames = app_to_net_image_inputs(pixel_values_or_image)

        # Pre-pad: reflect PRE_PAD pixels on the right and bottom edges.
        NCHW_fp32_torch_frames = F.pad(
            NCHW_fp32_torch_frames, (0, PRE_PAD, 0, PRE_PAD), "reflect"
        )
        padded_h, padded_w = NCHW_fp32_torch_frames.shape[-2:]

        # Run prediction
        upscaled_images = self.model(NCHW_fp32_torch_frames)
        if len(upscaled_images.shape) == 3:
            # Model dropped the batch dimension; restore it.
            upscaled_images = torch.unsqueeze(upscaled_images, 0)

        # Postprocess -- Remove padding.
        # The padding grows by the model's upscaling factor, so the factor is
        # inferred from the output / input size instead of assuming 4x. SCALE is
        # kept as the fallback when the sizes are not an exact integer multiple.
        _, _, h, w = upscaled_images.shape
        scale_h = h // padded_h if h % padded_h == 0 else SCALE
        scale_w = w // padded_w if w % padded_w == 0 else SCALE
        upscaled_images = upscaled_images[
            :, :, 0 : h - PRE_PAD * scale_h, 0 : w - PRE_PAD * scale_w
        ]

        return [torch_tensor_to_PIL_image(img) for img in upscaled_images]
def super_resolution_demo(
    model_cls: Type[BaseModel],
    default_image: str | CachedWebAsset,
    is_test: bool = False,
):
    """
    Run Super Resolution end-to-end on a sample image.

    Parameters:
        model_cls: Model class used to build the CLI parser and the model.
        default_image: Default for the --image argument (file path or URL).
        is_test: When True, CLI arguments are ignored (defaults are used) and
            neither the input nor the upscaled image is displayed or saved.
    """
    # Build the command line interface.
    cli = get_on_device_demo_parser(get_model_cli_parser(model_cls), add_output_dir=True)
    cli.add_argument(
        "--image",
        type=str,
        default=default_image,
        help="image file path or URL.",
    )
    cli_args = cli.parse_args(None if not is_test else [])
    validate_on_device_demo_args(cli_args, model_cls.get_model_id())

    # Build the app around the model, then load and upscale the image.
    upscaler = SuperResolutionApp(demo_model_from_cli_args(model_cls, cli_args))
    print("Model Loaded")
    low_res = load_image(cli_args.image)
    high_res_images = upscaler.upscale_image(low_res)

    if is_test:
        return

    display_or_save_image(
        low_res, cli_args.output_dir, "original_image.png", "original image"
    )
    display_or_save_image(
        high_res_images[0], cli_args.output_dir, "upscaled_image.png", "upscaled image"
    )
def split_linear_input(x, weight, bias, max_channel):
    """
    Apply a linear layer while splitting the input channels into chunks of at
    most `max_channel`, summing the partial results.

    The bias (if any) is added once, after the partial products are summed.
    """
    in_channels = x.size(-1)
    num_chunks = int(-(-in_channels // max_channel))  # Ceiling division
    if num_chunks == 1:
        # Small enough: a single fused linear is used.
        return F.linear(x, weight, bias)

    # x is chunked along its last dim, the weight along its input dim (dim 1),
    # so matching chunks multiply together.
    output = 0
    for x_part, weight_part in zip(
        x.chunk(num_chunks, dim=-1), weight.chunk(num_chunks, dim=1)
    ):
        output = output + F.linear(x_part, weight_part)

    if bias is not None:
        output += bias
    return output


def split_linear(x, weight, bias, max_channel=512):
    """
    Split linear input and output channels to have no more than `max_channel`
    """
    out_channels = weight.size(0)
    num_chunks = int(-(-out_channels // max_channel))  # Ceiling division
    if num_chunks == 1:
        # Output is small enough; only the input side may need splitting.
        return split_linear_input(x, weight, bias, max_channel)

    weight_chunks = weight.chunk(num_chunks, dim=0)
    if bias is None:
        bias_chunks = [None] * num_chunks
    else:
        bias_chunks = bias.chunk(num_chunks)

    # Apply each output slice separately and concatenate along the channel dim.
    pieces = []
    for weight_part, bias_part in zip(weight_chunks, bias_chunks):
        pieces.append(split_linear_input(x, weight_part, bias_part, max_channel))
    return torch.cat(pieces, dim=-1)
# Overrides for SwinTransformer model
# Alternative to https://github.com/pytorch/vision/blob/0d75d9e5516f446c9c0ef93bd4ed9fea13992d06/torchvision/models/swin_transformer.py#L116
# fixes view from rank-6 to rank-5 for SwinTransformer
def shifted_window_attention_inf(
    input: Tensor,
    qkv_weight: Tensor,
    proj_weight: Tensor,
    relative_position_bias: Tensor,
    window_size: List[int],
    num_heads: int,
    shift_size: List[int],
    attention_dropout: float = 0.0,
    dropout: float = 0.0,
    qkv_bias: Optional[Tensor] = None,
    proj_bias: Optional[Tensor] = None,
    logit_scale: Optional[Tensor] = None,
    training: bool = True,
) -> Tensor:
    """
    Updated from
    https://github.com/pytorch/vision/blob/0d75d9e5516f446c9c0ef93bd4ed9fea13992d06/torchvision/models/swin_transformer.py#L116

    Window based multi-head self attention with optional cyclic shift.

    Args:
        input: Tensor of shape [B, H, W, C].
        qkv_weight: Fused query/key/value weight; split into three equal parts
            along dim 0.
        proj_weight: Weight of the output projection.
        relative_position_bias: Bias added to the attention logits.
        window_size: [height, width] of the attention window.
        num_heads: Number of attention heads (C is split evenly across heads).
        shift_size: [height, width] cyclic shift. The list is copied, never
            modified in place.
        attention_dropout: Dropout probability applied to attention weights.
        dropout: Dropout probability applied to the projected output.
        qkv_bias: Optional fused query/key/value bias. None means no bias.
        proj_bias: Optional bias of the output projection.
        logit_scale: If given, cosine attention is used and the logits are
            scaled by exp(clamp(logit_scale, max=log(100))).
        training: Whether dropout is active.

    Returns:
        Tensor of shape [B, H, W, C].
    """
    B, H, W, C = input.shape
    # pad feature maps to multiples of window size
    pad_r = (window_size[1] - W % window_size[1]) % window_size[1]
    pad_b = (window_size[0] - H % window_size[0]) % window_size[0]
    x = input
    if pad_r != 0 or pad_b != 0:
        x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
    _, pad_H, pad_W, _ = x.shape

    shift_size = shift_size.copy()
    # If window size is larger than feature size, there is no need to shift window
    if window_size[0] >= pad_H:
        shift_size[0] = 0
    if window_size[1] >= pad_W:
        shift_size[1] = 0

    # cyclic shift
    if sum(shift_size) > 0:
        x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1]), dims=(1, 2))

    # partition windows
    num_windows = (pad_H // window_size[0]) * (pad_W // window_size[1])

    # Local change begin
    x = x.view(
        B * pad_H // window_size[0],
        window_size[0],
        pad_W // window_size[1],
        window_size[1] * C,
    )

    x = x.permute(0, 2, 1, 3).reshape(
        B * num_windows, window_size[0] * window_size[1], C
    )  # B*nW, Ws*Ws, C
    # Local change end

    # multi-head attention
    if logit_scale is not None and qkv_bias is not None:
        # Cosine attention path: the key part of the fused bias is zeroed
        # (on a clone, so the caller's tensor is untouched).
        qkv_bias = qkv_bias.clone()
        length = qkv_bias.numel() // 3
        qkv_bias[length : 2 * length].zero_()
    # === Local change begin ===
    # Split qkv projection
    q_weight, k_weight, v_weight = torch.split(
        qkv_weight, qkv_weight.shape[0] // 3, dim=0
    )
    if qkv_bias is None:
        # qkv_bias is optional; without this guard torch.split fails on None.
        q_bias = k_bias = v_bias = None
    else:
        q_bias, k_bias, v_bias = torch.split(
            qkv_bias, qkv_bias.shape[0] // 3, dim=0
        )
    if q_weight.shape[0] > 512:
        # Improve GPU residency with smaller fully connected layers
        q = split_linear(x, q_weight, q_bias)
        k = split_linear(x, k_weight, k_bias)
        v = split_linear(x, v_weight, v_bias)
    else:
        q = F.linear(x, q_weight, q_bias)
        k = F.linear(x, k_weight, k_bias)
        v = F.linear(x, v_weight, v_bias)

    q = q.reshape(x.size(0), x.size(1), num_heads, C // num_heads).permute(0, 2, 1, 3)
    k = k.reshape(x.size(0), x.size(1), num_heads, C // num_heads).permute(0, 2, 1, 3)
    v = v.reshape(x.size(0), x.size(1), num_heads, C // num_heads).permute(0, 2, 1, 3)
    # === Local change end ===
    if logit_scale is not None:
        # cosine attention
        attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
        logit_scale = torch.clamp(logit_scale, max=math.log(100.0)).exp()
        attn = attn * logit_scale
    else:
        q = q * (C // num_heads) ** -0.5
        attn = q.matmul(k.transpose(-2, -1))
    # add relative position bias
    attn = attn + relative_position_bias

    if sum(shift_size) > 0:
        # generate attention mask: every region of the shifted feature map gets
        # its own id, and token pairs with different ids are masked out.
        attn_mask = x.new_zeros((pad_H, pad_W))
        h_slices = (
            (0, -window_size[0]),
            (-window_size[0], -shift_size[0]),
            (-shift_size[0], None),
        )
        w_slices = (
            (0, -window_size[1]),
            (-window_size[1], -shift_size[1]),
            (-shift_size[1], None),
        )
        count = 0
        for h in h_slices:
            for w in w_slices:
                attn_mask[h[0] : h[1], w[0] : w[1]] = count
                count += 1
        attn_mask = attn_mask.view(
            pad_H // window_size[0],
            window_size[0],
            pad_W // window_size[1],
            window_size[1],
        )
        attn_mask = attn_mask.permute(0, 2, 1, 3).reshape(
            num_windows, window_size[0] * window_size[1]
        )
        attn_mask = attn_mask.unsqueeze(1) - attn_mask.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
            attn_mask == 0, float(0.0)
        )
        # ==== Local change begin ===
        attn = attn.view(
            x.size(0) // num_windows, num_windows, num_heads, x.size(1) * x.size(1)
        )
        attn = attn + attn_mask.reshape(num_windows, -1).unsqueeze(0).unsqueeze(2)
        # ==== Local change end ===
        attn = attn.view(-1, num_heads, x.size(1), x.size(1))

    attn = F.softmax(attn, dim=-1)
    attn = F.dropout(attn, p=attention_dropout, training=training)

    x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), C)
    x = F.linear(x, proj_weight, proj_bias)
    x = F.dropout(x, p=dropout, training=training)

    # reverse windows
    # Local change begin
    x = x.view(
        B * pad_H // window_size[0],
        pad_W // window_size[1],
        window_size[0],
        window_size[1] * C,
    )
    x = x.permute(0, 2, 1, 3).reshape(B, pad_H, pad_W, C)
    # Local change end

    # reverse cyclic shift
    if sum(shift_size) > 0:
        x = torch.roll(x, shifts=(shift_size[0], shift_size[1]), dims=(1, 2))

    # unpad features
    x = x[:, :H, :W, :].contiguous()
    return x
def normalize(video: torch.Tensor):
    """Normalize the video frames.
    Parameters:
        video: Video tensor (Number of frames x HWC) with values between 0-255
               Channel Layout: RGB

    Returns:
        video: Video is normalized to have values between 0-1
               and transposed so the shape is Channel x Number of frames x HW.
    """
    # Move the last (channel) dim to the front, then scale to float32 in [0, 1].
    return video.permute(3, 0, 1, 2).to(torch.float32) / 255


def resize(video: torch.Tensor, size: Tuple[int, int]):
    """
    Interpolate the frames of the video to match model's input resolution.

    Parameters:
        video: torch.Tensor
        size: Target (height, width) of each frame.

    Returns:
        video: Resized video is returned.
               Selected settings for resize were recommended.

    """
    return torch.nn.functional.interpolate(
        video, size=size, scale_factor=None, mode="bilinear", align_corners=False
    )


def crop(video: torch.Tensor, output_size: Tuple[int, int]):
    """
    Parameters:
        video: torch.Tensor
            Input video torch.Tensor.
        output_size: desired output shape for each frame.

    Returns:
        video: torch.Tensor
            Center cropped based on the output size

    """
    h, w = video.shape[-2:]
    th, tw = output_size
    # Top-left corner of the centered crop window.
    i = int(round((h - th) / 2.0))
    j = int(round((w - tw) / 2.0))
    return video[..., i : (i + th), j : (j + tw)]


def normalize_base(
    video: torch.Tensor, mean: List[float], std: List[float]
) -> torch.Tensor:
    """

    Parameters:
        video: Input video torch.Tensor
        mean: Mean to be subtracted per channel of the input.
        std: Standard deviation to be divided per each channel.

    Returns:
        video: Normalized based on provided mean and scale.
               The operation is done per channel.

    """
    # Reshape the statistics to (-1, 1, ..., 1) so they broadcast over every
    # dimension after the leading (channel) one.
    shape = (-1,) + (1,) * (video.dim() - 1)
    # Build the statistics on the video's device so that non-CPU inputs do not
    # fail with a device mismatch.
    mean_tensor = torch.as_tensor(mean, device=video.device).reshape(shape)
    std_tensor = torch.as_tensor(std, device=video.device).reshape(shape)
    return (video - mean_tensor) / std_tensor


def read_video_per_second(path: str) -> torch.Tensor:
    """

    Parameters:
        path: Path of the input video.

    Returns:
        input_video: Reads video from path and converts to torch tensor.

    """
    # Only the first value returned by read_video is used; the other two are
    # discarded.
    input_video, _, _ = torchvision.io.read_video(path, pts_unit="sec")
    return input_video
+ This preprocessing is dd + + """ + mean = [0.43216, 0.394666, 0.37645] + std = [0.22803, 0.22145, 0.216989] + input_video = normalize(input_video) + input_video = resize(input_video, (128, 171)) + input_video = crop(input_video, (112, 112)) + input_video = normalize_base(input_video, mean=mean, std=std) + return input_video + + +def get_class_name_kinetics_400() -> List[str]: + """Return the class name.""" + actions = "abseiling,air drumming,answering questions,applauding,applying cream,archery,arm wrestling,arranging flowers,assembling computer,auctioning,baby waking up,baking cookies,balloon blowing,bandaging,barbequing,bartending,beatboxing,bee keeping,belly dancing,bench pressing,bending back,bending metal,biking through snow,blasting sand,blowing glass,blowing leaves,blowing nose,blowing out candles,bobsledding,bookbinding,bouncing on trampoline,bowling,braiding hair,breading or breadcrumbing,breakdancing,brush painting,brushing hair,brushing teeth,building cabinet,building shed,bungee jumping,busking,canoeing or kayaking,capoeira,carrying baby,cartwheeling,carving pumpkin,catching fish,catching or throwing baseball,catching or throwing frisbee,catching or throwing softball,celebrating,changing oil,changing wheel,checking tires,cheerleading,chopping wood,clapping,clay pottery making,clean and jerk,cleaning floor,cleaning gutters,cleaning pool,cleaning shoes,cleaning toilet,cleaning windows,climbing a rope,climbing ladder,climbing tree,contact juggling,cooking chicken,cooking egg,cooking on campfire,cooking sausages,counting money,country line dancing,cracking neck,crawling baby,crossing river,crying,curling hair,cutting nails,cutting pineapple,cutting watermelon,dancing ballet,dancing charleston,dancing gangnam style,dancing macarena,deadlifting,decorating the christmas tree,digging,dining,disc golfing,diving cliff,dodgeball,doing aerobics,doing laundry,doing nails,drawing,dribbling basketball,drinking,drinking beer,drinking shots,driving car,driving 
tractor,drop kicking,drumming fingers,dunking basketball,dying hair,eating burger,eating cake,eating carrots,eating chips,eating doughnuts,eating hotdog,eating ice cream,eating spaghetti,eating watermelon,egg hunting,exercising arm,exercising with an exercise ball,extinguishing fire,faceplanting,feeding birds,feeding fish,feeding goats,filling eyebrows,finger snapping,fixing hair,flipping pancake,flying kite,folding clothes,folding napkins,folding paper,front raises,frying vegetables,garbage collecting,gargling,getting a haircut,getting a tattoo,giving or receiving award,golf chipping,golf driving,golf putting,grinding meat,grooming dog,grooming horse,gymnastics tumbling,hammer throw,headbanging,headbutting,high jump,high kick,hitting baseball,hockey stop,holding snake,hopscotch,hoverboarding,hugging,hula hooping,hurdling,hurling (sport),ice climbing,ice fishing,ice skating,ironing,javelin throw,jetskiing,jogging,juggling balls,juggling fire,juggling soccer ball,jumping into pool,jumpstyle dancing,kicking field goal,kicking soccer ball,kissing,kitesurfing,knitting,krumping,laughing,laying bricks,long jump,lunge,making a cake,making a sandwich,making bed,making jewelry,making pizza,making snowman,making sushi,making tea,marching,massaging back,massaging feet,massaging legs,massaging person's head,milking cow,mopping floor,motorcycling,moving furniture,mowing lawn,news anchoring,opening bottle,opening present,paragliding,parasailing,parkour,passing American football (in game),passing American football (not in game),peeling apples,peeling potatoes,petting animal (not cat),petting cat,picking fruit,planting trees,plastering,playing accordion,playing badminton,playing bagpipes,playing basketball,playing bass guitar,playing cards,playing cello,playing chess,playing clarinet,playing controller,playing cricket,playing cymbals,playing didgeridoo,playing drums,playing flute,playing guitar,playing harmonica,playing harp,playing ice hockey,playing keyboard,playing 
kickball,playing monopoly,playing organ,playing paintball,playing piano,playing poker,playing recorder,playing saxophone,playing squash or racquetball,playing tennis,playing trombone,playing trumpet,playing ukulele,playing violin,playing volleyball,playing xylophone,pole vault,presenting weather forecast,pull ups,pumping fist,pumping gas,punching bag,punching person (boxing),push up,pushing car,pushing cart,pushing wheelchair,reading book,reading newspaper,recording music,riding a bike,riding camel,riding elephant,riding mechanical bull,riding mountain bike,riding mule,riding or walking with horse,riding scooter,riding unicycle,ripping paper,robot dancing,rock climbing,rock scissors paper,roller skating,running on treadmill,sailing,salsa dancing,sanding floor,scrambling eggs,scuba diving,setting table,shaking hands,shaking head,sharpening knives,sharpening pencil,shaving head,shaving legs,shearing sheep,shining shoes,shooting basketball,shooting goal (soccer),shot put,shoveling snow,shredding paper,shuffling cards,side kick,sign language interpreting,singing,situp,skateboarding,ski jumping,skiing (not slalom or crosscountry),skiing crosscountry,skiing slalom,skipping rope,skydiving,slacklining,slapping,sled dog racing,smoking,smoking hookah,snatch weight lifting,sneezing,sniffing,snorkeling,snowboarding,snowkiting,snowmobiling,somersaulting,spinning poi,spray painting,spraying,springboard diving,squat,sticking tongue out,stomping grapes,stretching arm,stretching leg,strumming guitar,surfing crowd,surfing water,sweeping floor,swimming backstroke,swimming breast stroke,swimming butterfly stroke,swing dancing,swinging legs,swinging on something,sword fighting,tai chi,taking a shower,tango dancing,tap dancing,tapping guitar,tapping pen,tasting beer,tasting food,testifying,texting,throwing axe,throwing ball,throwing discus,tickling,tobogganing,tossing coin,tossing salad,training dog,trapezing,trimming or shaving beard,trimming trees,triple jump,tying bow tie,tying knot 
def recognize_action_kinetics_400(prediction: torch.Tensor) -> List[str]:
    """
    Return the names of the 5 highest scoring classes.

    Parameters:
        prediction: Scores for all classes. The tensor is flattened before the
                    top 5 entries are selected.

    Returns:
        classnames: List of 5 class names from the Kinetics-400 dataset,
                    ordered from highest to lowest score.

    """
    # Indices of the 5 largest scores in the flattened prediction.
    top5_indices = torch.topk(prediction.flatten(), 5).indices.tolist()

    class_names = get_class_name_kinetics_400()
    labels = []
    for class_index in top5_indices:
        labels.append(class_names[class_index])
    return labels
def kinetics_classifier_demo(
    model_type: Type[KineticsClassifier],
    default_video: str | CachedWebAsset,
    is_test: bool = False,
):
    """
    Run KineticsClassifierApp end-to-end on a sample video.

    Parameters:
        model_type: Classifier class used to build the CLI parser and the model.
        default_video: Default for the --video argument (file path or URL).
        is_test: When True, CLI arguments are ignored (defaults are used) and
            the predictions are not printed.
    """
    # Build the command line interface.
    cli = get_model_cli_parser(model_type)
    cli.add_argument(
        "--video", type=str, default=default_video, help="video file path or URL."
    )
    cli_args = cli.parse_args(None if not is_test else [])

    # Load the model and wrap it in the app.
    classifier_app = KineticsClassifierApp(model_from_cli_args(model_type, cli_args))
    print("Model Loaded")

    # The video is resolved into a temporary directory that is removed as soon
    # as prediction has finished.
    with tempfile.TemporaryDirectory() as scratch_dir:
        video_path = load_path(cli_args.video, scratch_dir)
        labels = classifier_app.predict(path=str(video_path))

    summary = ", ".join(labels)
    if is_test:
        return
    print(f"Top 5 predictions: {summary}")
class YoloObjectDetectionApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference
    with Yolo object detection models.

    The app works with following models:
        * YoloV7
        * YoloV8Detection

    For a given image input, the app will:
        * pre-process the image (convert to range[0, 1])
        * Run Yolo inference
        * post-process the model output using non maximum suppression
        * if requested, draw the predicted bounding boxes on the input image
    """

    def __init__(
        self,
        model: Callable[
            [torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
        ],
        nms_score_threshold: float = 0.45,
        nms_iou_threshold: float = 0.7,
    ):
        """
        Initialize a YoloObjectDetectionApp application.

        Parameters:
            model: Callable
                Yolo object detection model.

                Inputs:
                    Tensor of shape (N C H W x float32) with range [0, 1] and BGR channel layout.

                Outputs:
                    boxes: Tensor of shape [batch, num preds, 4] where 4 == (x1, y1, x2, y2).
                            The output are in the range of the input image's dimensions (NOT [0-1])

                    scores: Tensor of shape [batch, num_preds, # of classes (typically 80)]

                    class_idx: Tensor of shape [num_preds] where the values are the indices
                                of the most probable class of the prediction.

            nms_score_threshold
                Score threshold for non maximum suppression.

            nms_iou_threshold
                Intersection over Union threshold for non maximum suppression.
        """
        self.model = model
        self.nms_score_threshold = nms_score_threshold
        self.nms_iou_threshold = nms_iou_threshold

    def check_image_size(self, pixel_values: torch.Tensor) -> None:
        """
        Verify image size is valid model input.

        This base implementation always raises NotImplementedError; it is
        called by predict_boxes_from_image, so it has to be overridden.
        """
        raise NotImplementedError

    def predict(self, *args, **kwargs):
        # See predict_boxes_from_image.
        return self.predict_boxes_from_image(*args, **kwargs)

    def predict_boxes_from_image(
        self,
        pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image],
        raw_output: bool = False,
    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]] | List[
        np.ndarray
    ]:
        """
        From the provided image or tensor, predict the bounding boxes & classes of objects detected within.

        Parameters:
            pixel_values_or_image: torch.Tensor
                PIL image
                or
                numpy array (N H W C x uint8) or (H W C x uint8) -- both BGR channel layout
                or
                pyTorch tensor (N C H W x fp32, value range is [0, 1]), BGR channel layout

            raw_output: bool
                See "returns" doc section for details.

        Returns:
            If raw_output is true or pixel_values_or_image is a pyTorch tensor, returns:
                boxes: List[torch.Tensor]
                    Bounding box locations per batch. List element shape is [num preds, 4] where 4 == (x1, y1, x2, y2)
                scores: List[torch.Tensor]
                    class scores per batch multiplied by confidence: List element shape is [num_preds, # of classes (typically 80)]
                class_idx: List[torch.tensor]
                    Shape is [num_preds] where the values are the indices of the most probable class of the prediction.

            Otherwise, returns:
                images: List[np.ndarray]
                    A list of predicted BGR, [H, W, C] images (one list element per batch). Each image will have bounding boxes drawn.
        """

        # Input Prep
        NHWC_int_numpy_frames, NCHW_fp32_torch_frames = app_to_net_image_inputs(
            pixel_values_or_image
        )
        self.check_image_size(NCHW_fp32_torch_frames)

        # Run prediction
        pred_boxes, pred_scores, pred_class_idx = self.model(NCHW_fp32_torch_frames)

        # Non Maximum Suppression on each batch
        pred_boxes, pred_scores, pred_class_idx = batched_nms(
            self.nms_iou_threshold,
            self.nms_score_threshold,
            pred_boxes,
            pred_scores,
            pred_class_idx,
        )

        # Return raw output if requested
        if raw_output or isinstance(pixel_values_or_image, torch.Tensor):
            return (pred_boxes, pred_scores, pred_class_idx)

        # Add boxes to each batch. The return value of draw_box_from_xyxy is
        # not used; the frames themselves are returned below.
        for batch_idx, pred_boxes_batch in enumerate(pred_boxes):
            for box in pred_boxes_batch:
                draw_box_from_xyxy(
                    NHWC_int_numpy_frames[batch_idx],
                    box[0:2].int(),
                    box[2:4].int(),
                    color=(0, 255, 0),
                    size=2,
                )

        return NHWC_int_numpy_frames
qai_hub_models.utils.display import display_or_save_image +from qai_hub_models.utils.quantization import make_image_sample_data_loader + + +# Run Yolo end-to-end on a sample image. +# The demo will display a image with the predicted bounding boxes. +def yolo_detection_demo( + model_type: Type[BaseModel], + app_type: Callable[..., YoloObjectDetectionApp], + default_image: str | CachedWebAsset, + stride_multiple: int | None = None, + is_test: bool = False, +): + # Demo parameters + parser = get_model_cli_parser(model_type) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + image_help = "image file path or URL." + if stride_multiple: + image_help = f"{image_help} Image spatial dimensions (x and y) must be multiples of {stride_multiple}." + parser.add_argument("--image", type=str, default=default_image, help=image_help) + parser.add_argument( + "--score-threshold", + type=float, + default=0.45, + help="Score threshold for NonMaximumSuppression", + ) + parser.add_argument( + "--iou-threshold", + type=float, + default=0.7, + help="Intersection over Union (IoU) threshold for NonMaximumSuppression", + ) + parser.add_argument( + "--run-quantization", + action="store_true", + help="Set the flag to run post_train_quantization on sample data.", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, model_type.get_model_id()) + + if args.run_quantization: + # Override default quantized weight to use fp32 weight + args.weights_name = "" + model = demo_model_from_cli_args(model_type, args) + + if args.run_quantization: + # model is not quantized if we load fp32 weights instead of + # int8 weights + calibration_data_loader = make_image_sample_data_loader() + model.post_train_quantization(calibration_data_loader) + + app = app_type(model, args.score_threshold, args.iou_threshold) + print("Model Loaded") + image = load_image(args.image) + pred_images = app.predict_boxes_from_image(image) + out = Image.fromarray(pred_images[0]) + 
if not is_test: + display_or_save_image(out, args.output_dir, "yolo_demo_output.png") diff --git a/qai_hub_models/models/_shared/yolo/utils.py b/qai_hub_models/models/_shared/yolo/utils.py new file mode 100644 index 00000000..721d66ff --- /dev/null +++ b/qai_hub_models/models/_shared/yolo/utils.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.base_model import InputsType +from qai_hub_models.utils.image_processing import app_to_net_image_inputs + + +def transform_box_layout_xywh2xyxy(boxes: torch.Tensor) -> torch.Tensor: + """ + Convert boxes with (xywh) layout to (xyxy) + + Parameters: + boxes (torch.Tensor): Input boxes with layout (xywh) + + Returns: + torch.Tensor: Output box with layout (xyxy) + i.e. [top_left_x | top_left_y | bot_right_x | bot_right_y] + """ + # Convert to (x1, y1, x2, y2) + # TODO(#8595): Splitting ops into smaller chunks makes them NPU resident + cx = torch.split(boxes[..., 0], 5000, dim=-1) + cy = torch.split(boxes[..., 1], 5000, dim=-1) + w_2 = torch.split(boxes[..., 2] / 2, 5000, dim=-1) + h_2 = torch.split(boxes[..., 3] / 2, 5000, dim=-1) + boxes_splits = [] + for i in range(len(cx)): + top_left_x = cx[i] - w_2[i] + top_left_y = cy[i] - h_2[i] + bot_right_x = cx[i] + w_2[i] + bot_right_y = cy[i] + h_2[i] + boxes = torch.stack((top_left_x, top_left_y, bot_right_x, bot_right_y), -1) + boxes_splits.append(boxes) + return torch.cat(boxes_splits, dim=-2) + + +def detect_postprocess(detector_output: torch.Tensor): + """ + Post processing to break Yolo(v6,v7) detector output into multiple, consumable tensors (eg. for NMS). + such as bounding boxes, classes, and confidence. 
+ + Parameters: + detector_output: torch.Tensor + The output of Yolo Detection model + Shape is [batch, num_preds, k] + where, k = # of classes + 5 + k is structured as follows [boxes (4) : conf (1) : # of classes] + and boxes are co-ordinates [x_center, y_center, w, h] + + Returns: + boxes: torch.Tensor + Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2) + scores: torch.Tensor + class scores multiplied by confidence: Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds, 1] where the last dim is the index of the most probable class of the prediction. + """ + # Break output into parts + boxes = detector_output[:, :, :4] + conf = detector_output[:, :, 4:5] + scores = detector_output[:, :, 5:] + + # Convert boxes to (x1, y1, x2, y2) + boxes = transform_box_layout_xywh2xyxy(boxes) + + # Combine confidence and scores. + scores *= conf + + # Get class ID of most likely score. + scores, class_idx = get_most_likely_score(scores) + + return boxes, scores, class_idx + + +def get_most_likely_score(scores: torch.Tensor): + """ + Returns most likely score and class id + + Args: + scores (torch.tensor): final score after post-processing predictions + + Returns: + scores: torch.Tensor + class scores reduced to keep max score per prediction + Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. 
+ """ + # TODO(#8595): QNN crashes when running max on a large tensor + # Split into chunks of size 5k to keep the model NPU resident + score_splits = torch.split(scores, 5000, dim=-2) + max_scores = [] + max_indices = [] + for split in score_splits: + scores, class_idx = torch.max(split, -1, keepdim=False) + max_scores.append(scores) + max_indices.append(class_idx.float()) + return torch.cat(max_scores, dim=-1), torch.cat(max_indices, dim=-1) + + +def yolo_sample_inputs() -> InputsType: + image_address = CachedWebModelAsset.from_asset_store( + "yolov7", 1, "yolov7_demo_640.jpg" + ) + image = load_image(image_address) + return {"image": [app_to_net_image_inputs(image)[1].numpy()]} diff --git a/qai_hub_models/models/baichuan_7b_quantized/README.md b/qai_hub_models/models/baichuan_7b_quantized/README.md new file mode 100644 index 00000000..ba681122 --- /dev/null +++ b/qai_hub_models/models/baichuan_7b_quantized/README.md @@ -0,0 +1,27 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Baichuan-7B: Large language model achieving state-of-the-art performance on Chinese and English language benchmarks](https://aihub.qualcomm.com/models/baichuan_7b_quantized) + +Baichuan-7B is a family of LLMs. It achieves the state-of-the-art performance of its size on standard Chinese and English authoritative benchmarks (C-EVAL/MMLU). The model is quantized to 4-bit weights and 16-bit activations making it suitable for on-device deployment. For Prompt and output length specified below, the time to first token is Baichuan-PromptProcessor-Quantized's latency and average time per additional token is Baichuan-TokenGenerator-KVCache-Quantized's latency. + +This is based on the implementation of Baichuan-7B found +[here](https://github.com/baichuan-inc/Baichuan-7B/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/baichuan_7b_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Baichuan-7B can be found + [here](https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE). + + +## References +* [Baichuan 2: Open Large-scale Language Models](https://arxiv.org/abs/2309.10305) +* [Source Model Implementation](https://github.com/baichuan-inc/Baichuan-7B/) diff --git a/qai_hub_models/models/baichuan_7b_quantized/info.yaml b/qai_hub_models/models/baichuan_7b_quantized/info.yaml new file mode 100644 index 00000000..4fb26cc4 --- /dev/null +++ b/qai_hub_models/models/baichuan_7b_quantized/info.yaml @@ -0,0 +1,45 @@ +name: Baichuan-7B +id: baichuan_7b_quantized +status: public +headline: Large language model achieving state-of-the-art performance on Chinese and English language benchmarks. +domain: Generative AI +description: Baichuan-7B is a family of LLMs. It achieves the state-of-the-art performance of + its size on standard Chinese and English authoritative benchmarks (C-EVAL/MMLU). + The model is quantized to 4-bit weights and 16-bit activations making it suitable for on-device + deployment. For Prompt and output length specified below, + the time to first token is Baichuan-PromptProcessor-Quantized's latency and average + time per additional token is Baichuan-TokenGenerator-KVCache-Quantized's latency. 
+use_case: Text Generation +tags: + - llm + - generative-ai + - quantized +research_paper: https://arxiv.org/abs/2309.10305 +research_paper_title: "Baichuan 2: Open Large-scale Language Models" +license: https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE +source_repo: https://github.com/baichuan-inc/Baichuan-7B/ +technical_details: + Number of parameters: 7B + Model size: 3.9GB + Model-1 (Prompt Processor): Baichuan-PromptProcessor-Quantized + Max context length: 1024 + Prompt processor input: 1024 tokens + Prompt processor output: 1 output token + KVCache for token generator + Model-2 (Token Generator): Baichuan-TokenGenerator-KVCache-Quantized + Token generator input: 1 input token + past KVCache + Token generator output: 1 output token + KVCache for next iteration + Decoding length: 1024 (1 output token + 1023 from KVCache) + Use: Initiate conversation with prompt-processor and then token generator for subsequent iterations. + QNN-SDK: "2.19" +applicable_scenarios: + - Dialogue + - Content Generation + - Customer Support +related_models: [] +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: [] diff --git a/qai_hub_models/models/baichuan_7b_quantized/perf.yaml b/qai_hub_models/models/baichuan_7b_quantized/perf.yaml new file mode 100644 index 00000000..e0e4d38e --- /dev/null +++ b/qai_hub_models/models/baichuan_7b_quantized/perf.yaml @@ -0,0 +1,77 @@ +models: +- name: Baichuan-TokenGenerator-KVCache-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S24 Ultra + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-02-16T22:23:17.643089Z' + torchscript_onnx_qnn: + inference_time: 108059 + throughput: 9.25 + estimated_peak_memory_range: + min: 561152 + max: 112366992 + layer_info: + layers_on_npu: 33820 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 33820 + precision: uint16 + 
primary_compute_unit: NPU + job_id: "null" + job_status: Passed +- name: Baichuan-PromptProcessor-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S24 Ultra + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-02-16T22:23:17.643089Z' + torchscript_onnx_qnn: + inference_time: 2599326 + throughput: 0.38 + estimated_peak_memory_range: + min: 53248 + max: 40255040 + layer_info: + layers_on_npu: 31772 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31772 + precision: uint16 + primary_compute_unit: NPU + job_id: "null" + job_status: Passed +aggregated: + supported_devices: + - Samsung Galaxy S24 Ultra + supported_oses: + - Android + supported_chipsets: + - Snapdragon® 8 Gen 3 + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S24 Ultra + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-02-16T22:23:17.643089Z' + torchscript_onnx_qnn: + inference_time: 108059 + throughput: 9.25 + estimated_peak_memory_range: + min: 561152 + max: 112366992 + precision: uint16 + primary_compute_unit: NPU + job_id: "" + job_status: Passed diff --git a/qai_hub_models/models/controlnet_quantized/README.md b/qai_hub_models/models/controlnet_quantized/README.md new file mode 100644 index 00000000..bde72516 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ControlNet: Generating visual arts from text prompt and input guiding image](https://aihub.qualcomm.com/models/controlnet_quantized) + +On-device, high-resolution image synthesis from text and image prompts. ControlNet guides Stable-diffusion with provided input image to generate accurate images from given input prompt. 
+ +This is based on the implementation of ControlNet found +[here](https://github.com/lllyasviel/ControlNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/controlnet_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[controlnet_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.controlnet_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.controlnet_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ControlNet can be found + [here](https://github.com/lllyasviel/ControlNet/blob/main/LICENSE). 
+ + +## References +* [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) +* [Source Model Implementation](https://github.com/lllyasviel/ControlNet) diff --git a/qai_hub_models/models/controlnet_quantized/__init__.py b/qai_hub_models/models/controlnet_quantized/__init__.py new file mode 100644 index 00000000..6a85176d --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/__init__.py @@ -0,0 +1,7 @@ +from qai_hub_models.models.controlnet_quantized.app import ( # noqa: F401 + ControlNetApp as app, +) +from qai_hub_models.models.controlnet_quantized.model import MODEL_ID # noqa: F401 +from qai_hub_models.models.controlnet_quantized.model import ( # noqa: F401 + ControlNetQuantized as Model, +) diff --git a/qai_hub_models/models/controlnet_quantized/app.py b/qai_hub_models/models/controlnet_quantized/app.py new file mode 100644 index 00000000..8902ae37 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/app.py @@ -0,0 +1,249 @@ +from typing import Any, Callable, Tuple + +import cv2 +import numpy as np +import torch +from diffusers.models.embeddings import get_timestep_embedding +from PIL import Image +from torchvision import transforms + +OUT_H, OUT_W = 512, 512 + + +class ControlNetApp: + """ + ControlNetApp represents the application code needed to string + together the various neural networks that make up the ControlNet + algorithm. This code is written in Python and pipeline uses PyTorch + while running neural networks on-device. This is meant to serve as a + reference implementation for this application in other languages and + for other platforms. + + Please run the app via `demo.py`. 
+ + References + ---------- + * https://arxiv.org/abs/2302.05543 + * https://github.com/lllyasviel/ControlNet + """ + + def __init__( + self, + text_encoder: Callable[..., Tuple[torch.Tensor, ...]], + vae_decoder: Callable[..., Tuple[torch.Tensor, ...]], + unet: Callable[..., Tuple[torch.Tensor, ...]], + controlnet: Callable[..., Tuple[torch.Tensor, ...]], + tokenizer: Any, + scheduler: Any, + time_embedding: Any, + ): + """ + Initializes ControlNetApp with required neural networks for end-to-end pipeline. + + Parameters + ---------- + text_encoder: + Encoder input text + vae_decoder: + Decoder to decode latent space into output image + unet: + Denoises image in latent space + controlnet: + Conditions denoise w.r.t. input image + tokenizer: + Tokenizer for input text. + Output of Tokenizer is fed to text_encoder. + One can experiments with different tokenizers available based on Clip-ViT. + scheduler: + Solver for diffusion steps. + Updates latent space during each iteration. + time_embeddings: + Projects time-step into embedding used during denoising in latent space. 
+ """ + + self.text_encoder = text_encoder + self.vae_decoder = vae_decoder + self.unet = unet + self.controlnet = controlnet + self.tokenizer = tokenizer + self.scheduler = scheduler + self.time_embedding = time_embedding + + def get_time_embedding(self, timestep): + timestep = torch.tensor([timestep]) + t_emb = get_timestep_embedding(timestep, 320, True, 0) + emb = self.time_embedding(t_emb) + + return emb + + def _make_canny_image(self, input_image: Image): + image = np.asarray(input_image) + + # Get edges for input with Canny Edge Detection + low_threshold = 100 + high_threshold = 200 + + image = cv2.Canny(image, low_threshold, high_threshold) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + + # Make image channel-first and scale + image = np.transpose(image, (2, 0, 1)) + image = image.astype(np.float32) / 255.0 + torch_image = torch.Tensor(image).unsqueeze(0) + + # Resize input image to supported size + return transforms.Resize(size=(OUT_H, OUT_W))(torch_image) + + def _encode_text_prompt(self, prompt: str) -> torch.Tensor: + """ + Takes a text prompt and returns a tensor with its text embedding. + + Parameters + ---------- + prompt: + The text prompt to encode. 
+ """ + # Tokenize input prompt + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + + max_length = text_input.input_ids.shape[-1] + uncond_input = self.tokenizer( + [""], + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + + # Embed using the text encoder neural network + # Encode input and empty prompt in one go + print(f"Extracting embeddings (inference on TextEncoder)\n{'-' * 50}") + embeddings = self.text_encoder( + [ + text_input.input_ids.type(torch.int32), + uncond_input.input_ids.type(torch.int32), + ] + ) + cond_embeddings, uncond_embeddings = torch.split(embeddings, 1, 0) + return cond_embeddings, uncond_embeddings + + def predict(self, *args, **kwargs): + # See generate_image. + return self.generate_image(*args, **kwargs) + + def generate_image( + self, + prompt: str, + input_image: Image, + num_steps: int = 5, + seed: int = 0, + guidance_scale: float = 7.5, + ) -> torch.Tensor: + """ + Generate an image using the PyTorch reference neural networks. This + code can be used as a reference for how to glue together the neural + networks in an application. Note that this code relies on a tokenizer + and scheduler from the HuggingFace's diffusers library, so those would + have to be ported to the application as well. + + Parameters + ---------- + prompt: + The text prompt to generate an image from. + num_steps: + The number of steps to run the diffusion process for. Higher value + may lead to better image quality. + input_image: + Path to input image for conditioning image generation. + seed: + The seed to use for the random number generator. + guidance_scale: + Classifier-free guidance is a method that allows us to control how + strongly the image generation is guided by the prompt. 
This is done + by always processing two samples at once: an unconditional (using a + text embedding of an empty prompt) and a conditional (using a text + embedding of the provided prompt). Given the noise prediction of + both of these, we linearly interpolate between them based on the + guidance_scale. A guidance scale of 0 is the same as using an empty + prompt. A guidance scale of 1 turns off classifier-free guidance + and is computationally less expensive since it only processes one + sample at a time. Intuitively you may think the rest of guidance + scales are between 0 and 1, but it is common to use a scale greater + than 1 as a method of amplifying the prompt's influence on the + image, pushing it further away from the unconditional sample. + + Returns + ------- + torch.Tensor + The generated image in RGB scaled in [0, 1] with tensor shape (H, + W, 3). The height and the width may depend on the underlying Stable + Diffusion version, but is typically 512x512. + """ + + # Encode text prompt + cond_embeddings, uncond_embeddings = self._encode_text_prompt(prompt) + self.scheduler.set_timesteps(num_steps) + self.scheduler.config.prediction_type = "epsilon" + + # Channel last input + latents_shape = (1, 4, OUT_H // 8, OUT_W // 8) + + generator = torch.manual_seed(seed) + latents = torch.randn(latents_shape, generator=generator) + latents = latents * self.scheduler.init_noise_sigma + + # Helper method to go back and forth from channel-first to channel-last + def _make_channel_last_torch(input_tensor): + return torch.permute(input_tensor, [0, 2, 3, 1]) + + def _make_channel_first_torch(input_tensor): + return torch.permute(torch.Tensor(input_tensor), [0, 3, 1, 2]) + + # Get image with edges for conditioning + canny_image = self._make_canny_image(input_image) + canny_image = _make_channel_last_torch(canny_image) + + for i, t in enumerate(self.scheduler.timesteps): + print(f"Step: {i + 1}\n{'-' * 20}") + time_emb = self.get_time_embedding(t) + latent_model_input = 
self.scheduler.scale_model_input(latents, t) + + latent_model_input = _make_channel_last_torch(latent_model_input) + + # Denoise input + print( + f"Denoising image in latent space (inference on ControlNet)\n{'-' * 50}" + ) + controlnet_out = self.controlnet( + [latent_model_input] * 2, + [time_emb] * 2, + [cond_embeddings, uncond_embeddings], + [canny_image] * 2, + ) + controlnet_out_split = [] + for each in controlnet_out: + controlnet_out_split.append(torch.split(each, 1, 0)) + + print(f"Denoising image in latent space (inference on UNet)\n{'-' * 50}") + noise_pred = self.unet( + [latent_model_input] * 2, + [time_emb] * 2, + [cond_embeddings, uncond_embeddings], + *controlnet_out_split, + ) + noise_cond, noise_uncond = torch.split(noise_pred, 1, 0) + + noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond) + noise_pred = _make_channel_first_torch(noise_pred) + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + print(f"Decoding generated image (inference on VAEDecoder)\n{'-' * 50}") + # Decode generated image from latent space + latents_vae = _make_channel_last_torch(latents) + image = self.vae_decoder(latents_vae) + return image diff --git a/qai_hub_models/models/controlnet_quantized/demo.py b/qai_hub_models/models/controlnet_quantized/demo.py new file mode 100644 index 00000000..c1a18274 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/demo.py @@ -0,0 +1,167 @@ +import argparse + +import numpy as np +import qai_hub as hub +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from PIL import Image +from transformers import CLIPTokenizer + +from qai_hub_models.models.controlnet_quantized.app import ControlNetApp +from qai_hub_models.models.controlnet_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + ControlNet, + Unet, + VAEDecoder, +) +from qai_hub_models.utils.args import add_output_dir_arg +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, 
load_image +from qai_hub_models.utils.base_model import BasePrecompiledModel +from qai_hub_models.utils.display import display_or_save_image +from qai_hub_models.utils.inference import HubModel +from qai_hub_models.utils.qai_hub_helpers import can_access_qualcomm_ai_hub + +INPUT_IMAGE = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_bird_image.png" +).fetch() + +DEFAULT_DEMO_PROMPT = "Big colorful bird in high resolution, 4K" +DEFAULT_DEVICE_NAME = "Samsung Galaxy S23 Ultra" + + +def _get_hub_model(input_model: BasePrecompiledModel, device_name=DEFAULT_DEVICE_NAME): + if not can_access_qualcomm_ai_hub(): + raise RuntimeError( + "ControlNet on-device demo requires access to QAI-Hub.\n" + "Please visit https://aihub.qualcomm.com/ and sign-up." + ) + + # Upload model + uploaded_model = hub.upload_model(input_model.get_target_model_path()) + inputs = list(input_model.get_input_spec().keys()) + return HubModel(uploaded_model, inputs, hub.Device(name=device_name)) + + +# Run ControlNet end-to-end on a given prompt and input image. +# The demo will output an AI-generated image based on the given inputs. 
+def main(is_test: bool = False): + parser = argparse.ArgumentParser() + parser.add_argument( + "--prompt", + type=str, + default=DEFAULT_DEMO_PROMPT, + help="Prompt to generate image from.", + ) + parser.add_argument( + "--image", + type=str, + default=INPUT_IMAGE, + help="Input image to extract edges from.", + ) + parser.add_argument( + "--num-steps", + type=int, + default=5, + help="The number of diffusion iteration steps (higher means better quality).", + ) + parser.add_argument( + "--seed", + type=int, + default=0, + help="Random seed.", + ) + add_output_dir_arg(parser) + parser.add_argument( + "--guidance-scale", + type=float, + default=7.5, + help="Strength of guidance (higher means more influence from prompt).", + ) + parser.add_argument( + "--device-name", + type=str, + default=DEFAULT_DEVICE_NAME, + help="Device to run stable-diffusion demo on.", + ) + args = parser.parse_args([] if is_test else None) + + if not is_test: + print(f"\n{'-' * 100}") + print( + f"** Performing image generation on-device({args.device_name}) with ControlNet - Stable Diffusion **" + ) + print() + print("Prompt:", args.prompt) + print("Image:", args.image) + print("Number of steps:", args.num_steps) + print("Guidance scale:", args.guidance_scale) + print("Seed:", args.seed) + print() + print( + "Note: This reference demo uses significant amounts of memory and may take a few minutes to run." + ) + print(f"{'-' * 100}\n") + + print(f"Downloading model assets\n{'-' * 50}") + # Load components + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + controlnet = ControlNet.from_precompiled() + + # Create four HubModel instances to prepare for on-device inference. + # This is similar to initializing PyTorch model to call forward method later. + # Instead of forward, we later submit inference_jobs on QAI-Hub for + # on-device evaluation. 
+ print(f"Uploading model assets on QAI-Hub\n{'-' * 50}") + text_encoder = _get_hub_model(text_encoder, args.device_name) + unet = _get_hub_model(unet, args.device_name) + vae_decoder = _get_hub_model(vae_decoder, args.device_name) + controlnet = _get_hub_model(controlnet, args.device_name) + + # Create tokenizer, scheduler and time_embedding required + # for control-net pipeline. + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + embedding = UNet2DConditionModel.from_pretrained( + "runwayml/stable-diffusion-v1-5", subfolder="unet" + ).time_embedding + + # Load Application + app = ControlNetApp( + text_encoder=text_encoder, + vae_decoder=vae_decoder, + unet=unet, + controlnet=controlnet, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=embedding, + ) + + # Generate image + image = app.generate_image( + args.prompt, + load_image(args.image), + num_steps=args.num_steps, + seed=args.seed, + guidance_scale=args.guidance_scale, + ) + + pil_img = Image.fromarray(np.round(image.numpy() * 255).astype(np.uint8)[0]) + + if not is_test: + display_or_save_image(pil_img, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/controlnet_quantized/export.py b/qai_hub_models/models/controlnet_quantized/export.py new file mode 100644 index 00000000..fe425524 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/export.py @@ -0,0 +1,168 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import List, Mapping, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.controlnet_quantized import Model +from qai_hub_models.utils.args import TargetRuntime, export_parser +from qai_hub_models.utils.printing import print_profile_metrics_from_job +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["text_encoder", "unet", "vae_decoder", "controlnet"] +DEFAULT_COMPONENTS = ["text_encoder", "vae_decoder", "unet", "controlnet"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[str, Tuple[Optional[hub.ProfileJob], Optional[hub.InferenceJob]]] | List[ + str +]: + """ + This function accomplishes 5 main tasks: + + 1. Initialize model. + 2. Upload model assets to hub. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Summarizes the results from profiling. + + Each of the last three steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_summary: If set, skips waiting for and summarizing results + from profiling. 
+ output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_precompiled` + + Returns: + A Mapping from component_name to a 2-tuple of: + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "controlnet_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component in components: + if component not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "controlnet_quantized", + "ControlNet", + device, + skip_profiling, + skip_inferencing, + False, + skip_summary, + output_path, + TargetRuntime.QNN, + "", + profile_options, + component_arg, + ) + + # 1. Initialize model + print("Initializing model class") + model = Model.from_precompiled() + components_dict = {} + if "text_encoder" in components: + components_dict["text_encoder"] = model.text_encoder + if "unet" in components: + components_dict["unet"] = model.unet + if "vae_decoder" in components: + components_dict["vae_decoder"] = model.vae_decoder + if "controlnet" in components: + components_dict["controlnet"] = model.controlnet + + # 2. Upload model assets to hub + print("Uploading model assets on hub") + uploaded_models = {} + for component_name in components: + uploaded_models[component_name] = hub.upload_model( + components_dict[component_name].get_target_model_path() + ) + + # 3. 
Profile the model assets on real devices + profile_jobs = {} + if not skip_profiling: + for component_name in components: + print(f"Profiling model {component_name} on a hosted device.") + profile_jobs[component_name] = hub.submit_profile_job( + model=uploaded_models[component_name], + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_jobs = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + sample_inputs = components_dict[component_name].sample_inputs() + inference_jobs[component_name] = hub.submit_inference_job( + model=uploaded_models[component_name], + inputs=sample_inputs, + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 5. Summarize the results from profiling + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + return { + component_name: ( + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, exporting_compiled_model=True + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/controlnet_quantized/info.yaml b/qai_hub_models/models/controlnet_quantized/info.yaml new file mode 100644 index 00000000..380c2442 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/info.yaml @@ -0,0 +1,36 @@ +name: ControlNet +id: controlnet_quantized +status: public +headline: Generating visual arts from text prompt and 
input guiding image. +domain: Generative AI +description: On-device, high-resolution image synthesis from text and image prompts. + ControlNet guides Stable-diffusion with provided input image to generate accurate images from given input prompt. +use_case: Image Generation +tags: + - generative-ai + - quantized +research_paper: https://arxiv.org/abs/2302.05543 +research_paper_title: "Adding Conditional Control to Text-to-Image Diffusion Models" +license: https://github.com/lllyasviel/ControlNet/blob/main/LICENSE +source_repo: https://github.com/lllyasviel/ControlNet +technical_details: + Text Encoder Number of parameters: 340M + UNet Number of parameters: 865M + VAE Decoder Number of parameters: 83M + ControlNet Number of parameters: 361M + Model size: 1.4GB + Input: Text prompt and input image as a reference + QNN-SDK: "2.19" +applicable_scenarios: + - Image Generation + - Image Editing + - Content Creation +related_models: + - stable_diffusion_quantized +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: no +dataset: [] +license_type: apache-2.0 diff --git a/qai_hub_models/models/controlnet_quantized/model.py b/qai_hub_models/models/controlnet_quantized/model.py new file mode 100644 index 00000000..30861d7e --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/model.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +import os + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BasePrecompiledModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +QNN_SDK_PREFIX = "QNN219" +TEXT_ENCODER = os.path.join(QNN_SDK_PREFIX, "text_encoder.serialized.bin") +UNET_DIFFUSER = os.path.join(QNN_SDK_PREFIX, "unet.serialized.bin") +VAE_DECODER = os.path.join(QNN_SDK_PREFIX, "vae_decoder.serialized.bin") +CONTROL_NET = os.path.join(QNN_SDK_PREFIX, "controlnet.serialized.bin") + + +class 
ControlNetQuantized: + """ + ControlNet class consists of + - Text Encoder + - UNet based diffuser + - VAE decoder, and + - ControlNet + + All models are pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, text_encoder, unet, vae_decoder, controlnet) -> None: + self.text_encoder = text_encoder + self.unet = unet + self.vae_decoder = vae_decoder + self.controlnet = controlnet + + @classmethod + def from_precompiled(cls) -> "ControlNetQuantized": + return ControlNetQuantized( + text_encoder=ClipVITTextEncoder.from_precompiled(), + unet=Unet.from_precompiled(), + vae_decoder=VAEDecoder.from_precompiled(), + controlnet=ControlNet.from_precompiled(), + ) + + +class ClipVITTextEncoder(BasePrecompiledModel): + """ + CLIP-ViT based Text Encoder. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, target_model_path) -> None: + self.target_model_path = target_model_path + + @classmethod + def from_precompiled(cls) -> "ClipVITTextEncoder": + text_encoder_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, TEXT_ENCODER + ).fetch() + return ClipVITTextEncoder(text_encoder_path) + + def get_target_model_path(self) -> str: + return self.target_model_path + + def get_input_spec(self) -> InputSpec: + return {"input_1": ((1, 77), "int32")} + + +class Unet(BasePrecompiledModel): + """ + UNet model to denoise image in latent space. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
+ """ + + def __init__(self, target_model_path) -> None: + self.target_model_path = target_model_path + + @classmethod + def from_precompiled(cls) -> "Unet": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, UNET_DIFFUSER + ).fetch() + return Unet(model_path) + + def get_target_model_path(self) -> str: + return self.target_model_path + + def get_input_spec(self) -> InputSpec: + return { + "input_1": ((1, 64, 64, 4), "float32"), + "input_2": ((1, 1280), "float32"), + "input_3": ((1, 77, 768), "float32"), + "controlnet_downblock1": ((1, 64, 64, 320), "float32"), + "controlnet_downblock2": ((1, 64, 64, 320), "float32"), + "controlnet_downblock3": ((1, 64, 64, 320), "float32"), + "controlnet_downblock4": ((1, 32, 32, 320), "float32"), + "controlnet_downblock5": ((1, 32, 32, 640), "float32"), + "controlnet_downblock6": ((1, 32, 32, 640), "float32"), + "controlnet_downblock7": ((1, 16, 16, 640), "float32"), + "controlnet_downblock8": ((1, 16, 16, 1280), "float32"), + "controlnet_downblock9": ((1, 16, 16, 1280), "float32"), + "controlnet_downblock10": ((1, 8, 8, 1280), "float32"), + "controlnet_downblock11": ((1, 8, 8, 1280), "float32"), + "controlnet_downblock12": ((1, 8, 8, 1280), "float32"), + "controlnet_midblock": ((1, 8, 8, 1280), "float32"), + } + + +class VAEDecoder(BasePrecompiledModel): + """ + Decodes image from latent into output generated image. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
+ """ + + def __init__(self, target_model_path) -> None: + self.target_model_path = target_model_path + + @classmethod + def from_precompiled(cls) -> "VAEDecoder": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, VAE_DECODER + ).fetch() + return VAEDecoder(model_path) + + def get_target_model_path(self) -> str: + return self.target_model_path + + def get_input_spec(self) -> InputSpec: + return {"input_1": ((1, 64, 64, 4), "float32")} + + +class ControlNet(BasePrecompiledModel): + """ + Decodes image from latent into output generated image. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, target_model_path) -> None: + self.target_model_path = target_model_path + + @classmethod + def from_precompiled(cls) -> "ControlNet": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, CONTROL_NET + ).fetch() + return ControlNet(model_path) + + def get_target_model_path(self) -> str: + return self.target_model_path + + def get_input_spec(self) -> InputSpec: + return { + "input_1": ((1, 64, 64, 4), "float32"), + "input_2": ((1, 1280), "float32"), + "input_3": ((1, 77, 768), "float32"), + "input_4": ((1, 512, 512, 3), "float32"), + } diff --git a/qai_hub_models/models/controlnet_quantized/perf.yaml b/qai_hub_models/models/controlnet_quantized/perf.yaml new file mode 100644 index 00000000..1b8fdd4d --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/perf.yaml @@ -0,0 +1,127 @@ +models: +- name: Text-Encoder-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 11369 + throughput: 87.95 + estimated_peak_memory_range: + min: 57344 + max: 34869152 + layer_info: + 
layers_on_npu: 570 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 570 + precision: uint16 + primary_compute_unit: NPU + job_id: jz5w40nzg + job_status: Passed +- name: VAE-Decoder-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 386746 + throughput: 2.58 + estimated_peak_memory_range: + min: 122880 + max: 4489392 + layer_info: + layers_on_npu: 409 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 409 + precision: uint16 + primary_compute_unit: NPU + job_id: jnp16kxk5 + job_status: Passed +- name: UNet-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 259981 + throughput: 3.84 + estimated_peak_memory_range: + min: 13058048 + max: 15044232 + layer_info: + layers_on_npu: 5434 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 5434 + precision: uint16 + primary_compute_unit: NPU + job_id: jmg9d7eq5 + job_status: Passed +- name: ControlNet-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 103748 + throughput: 9.63 + estimated_peak_memory_range: + min: 200704 + max: 23278088 + layer_info: + layers_on_npu: 2406 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2406 + precision: uint16 + primary_compute_unit: NPU + job_id: jw56w9jng + job_status: Passed +aggregated: + supported_devices: + - Samsung Galaxy S23 Ultra + supported_oses: + - Android + supported_chipsets: 
+ - Snapdragon® 8 Gen 2 + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 761844 + throughput: 1.31 + estimated_peak_memory_range: + min: 13058048 + max: 34869152 + precision: int16 + primary_compute_unit: NPU + job_id: "" + job_status: Passed diff --git a/qai_hub_models/models/controlnet_quantized/requirements.txt b/qai_hub_models/models/controlnet_quantized/requirements.txt new file mode 100644 index 00000000..8d0cd0c6 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/requirements.txt @@ -0,0 +1,3 @@ +transformers==4.31.0 +diffusers[torch]==0.21.4 +opencv-python==4.8.1.78 diff --git a/qai_hub_models/models/controlnet_quantized/test.py b/qai_hub_models/models/controlnet_quantized/test.py new file mode 100644 index 00000000..a7923ed4 --- /dev/null +++ b/qai_hub_models/models/controlnet_quantized/test.py @@ -0,0 +1,35 @@ +import tempfile + +import pytest + +from qai_hub_models.models.controlnet_quantized.demo import main as demo_main +from qai_hub_models.models.controlnet_quantized.export import export_model + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + with tempfile.TemporaryDirectory() as tmpdir: + exported_jobs = export_model( + # Testing text_encoder as it's smallest model in + # ControlNet pipeline + components=["text_encoder"], + skip_inferencing=True, + skip_downloading=True, + skip_summary=True, + output_dir=tmpdir, + ) + + # NOTE: Not waiting for job to finish + # as it will slow CI down. + # Rather, we should create waiting test and move to nightly. 
+ for jobs in exported_jobs.values(): + profile_job, inference_job = jobs[0], jobs[1] + assert profile_job is not None + assert inference_job is None + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/convnext_tiny/README.md b/qai_hub_models/models/convnext_tiny/README.md new file mode 100644 index 00000000..7f489f79 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ConvNext-Tiny: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/convnext_tiny) + +ConvNextTiny is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ConvNext-Tiny found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/convnext_tiny). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.convnext_tiny.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. 
+ +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.convnext_tiny.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ConvNext-Tiny can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py) diff --git a/qai_hub_models/models/convnext_tiny/__init__.py b/qai_hub_models/models/convnext_tiny/__init__.py new file mode 100644 index 00000000..35e617f8 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ConvNextTiny as Model # noqa: F401 diff --git a/qai_hub_models/models/convnext_tiny/demo.py b/qai_hub_models/models/convnext_tiny/demo.py new file mode 100644 index 00000000..b06b2602 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.convnext_tiny.model import ConvNextTiny + + +def main(is_test: bool = False): + imagenet_demo(ConvNextTiny, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny/export.py b/qai_hub_models/models/convnext_tiny/export.py new file mode 100644 index 00000000..984e56e6 --- /dev/null +++ 
b/qai_hub_models/models/convnext_tiny/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.convnext_tiny import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. 
+ Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "convnext_tiny" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "convnext_tiny", + "ConvNext-Tiny", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny/info.yaml b/qai_hub_models/models/convnext_tiny/info.yaml new file mode 100644 index 00000000..0f6bc919 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/info.yaml @@ -0,0 +1,38 @@ +name: ConvNext-Tiny +# id must match with the model dir name in qai_hub_models +id: convnext_tiny +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ConvNextTiny is a machine learning model that can classify images from + the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. 
+use_case: Image Classification +tags: [] +research_paper: https://arxiv.org/abs/2201.03545 +research_paper_title: A ConvNet for the 2020s +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py +technical_details: + Number of parameters: 28.6M + Model size: 109 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: +- Medical Imaging +- Anomaly Detection +- Inventory Management +form_factors: +- Phone +- Tablet +- IoT +related_models: +- mobilenet_v2 +- densenet121 +- googlenet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/convnext_tiny/model.py b/qai_hub_models/models/convnext_tiny/model.py new file mode 100644 index 00000000..935804e4 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class ConvNextTiny(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.convnext_tiny(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/convnext_tiny/perf.yaml b/qai_hub_models/models/convnext_tiny/perf.yaml new file mode 100644 index 00000000..e76fd904 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - 
Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ConvNext-Tiny + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 11532.0 + throughput: 86.71522719389525 + estimated_peak_memory_range: + min: 339968 + max: 2817216 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 380 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 380 + job_id: jmg9zykqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:23:45.273161Z' diff --git a/qai_hub_models/models/convnext_tiny/test.py b/qai_hub_models/models/convnext_tiny/test.py new file mode 100644 index 00000000..0178c7ea --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.convnext_tiny.demo import main as demo_main +from qai_hub_models.models.convnext_tiny.model import MODEL_ID, ConvNextTiny + + +def test_task(): + run_imagenet_classifier_test(ConvNextTiny.from_pretrained(), MODEL_ID) + + +def test_trace(): + run_imagenet_classifier_trace_test(ConvNextTiny.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/ddrnet23_slim/README.md 
b/qai_hub_models/models/ddrnet23_slim/README.md new file mode 100644 index 00000000..96da4502 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DDRNet23-Slim: Segment images or video by class in real-time on device](https://aihub.qualcomm.com/models/ddrnet23_slim) + +DDRNet23Slim is a machine learning model that segments an image into semantic classes, specifically designed for road-based scenes. It is designed for the application of self-driving cars. + +This is based on the implementation of DDRNet23-Slim found +[here](https://github.com/chenjun2hao/DDRNet.pytorch). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/ddrnet23_slim). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.ddrnet23_slim.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.ddrnet23_slim.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. 
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DDRNet23-Slim can be found + [here](https://github.com/chenjun2hao/DDRNet.pytorch/blob/main/LICENSE). + + +## References +* [Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes](https://arxiv.org/abs/2101.06085) +* [Source Model Implementation](https://github.com/chenjun2hao/DDRNet.pytorch) diff --git a/qai_hub_models/models/ddrnet23_slim/__init__.py b/qai_hub_models/models/ddrnet23_slim/__init__.py new file mode 100644 index 00000000..00999e79 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/__init__.py @@ -0,0 +1,3 @@ +from .app import DDRNetApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import DDRNet as Model # noqa: F401 diff --git a/qai_hub_models/models/ddrnet23_slim/app.py b/qai_hub_models/models/ddrnet23_slim/app.py new file mode 100644 index 00000000..4aba5451 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/app.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from typing import Callable, List + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image + +from qai_hub_models.models.ddrnet23_slim.model import NUM_CLASSES +from qai_hub_models.utils.draw import create_color_map +from qai_hub_models.utils.image_processing import ( + app_to_net_image_inputs, + normalize_image_transform, +) + + +class DDRNetApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with DDRNet. 
+ + The app uses 1 model: + * DDRNet + + For a given image input, the app will: + * pre-process the image (convert to range[0, 1]) + * Run DDRNet inference + * Convert the output segmentation mask into a visual representation + * Overlay the segmentation mask onto the image and return it + """ + + def __init__(self, model: Callable[[torch.Tensor], torch.Tensor]): + self.model = model + + def predict(self, *args, **kwargs): + # See segment_image. + return self.segment_image(*args, **kwargs) + + def segment_image( + self, + pixel_values_or_image: torch.Tensor + | np.ndarray + | Image.Image + | List[Image.Image], + raw_output: bool = False, + ) -> List[Image.Image] | np.ndarray: + """ + Return the input image with the segmentation mask overlayed on it. + + Parameters: + pixel_values_or_image + PIL image(s) + or + numpy array (N H W C x uint8) or (H W C x uint8) -- both RGB channel layout + or + pyTorch tensor (N C H W x fp32, value range is [0, 1]), RGB channel layout + + raw_output: bool + See "returns" doc section for details. + + Returns: + If raw_output is true, returns: + masks: np.ndarray + A list of predicted masks. + + Otherwise, returns: + segmented_images: List[PIL.Image] + Images with segmentation map overlaid with an alpha of 0.5. 
+ """ + NHWC_int_numpy_frames, NCHW_fp32_torch_frames = app_to_net_image_inputs( + pixel_values_or_image + ) + input_transform = normalize_image_transform() + NCHW_fp32_torch_frames = input_transform(NCHW_fp32_torch_frames) + + with torch.no_grad(): + # pred_mask is 8x downsampled + pred_masks = self.model(NCHW_fp32_torch_frames) + + # Upsample pred mask to original image size + # Need to upsample in the probability space, not in class labels + pred_masks = F.interpolate( + input=pred_masks, + size=NCHW_fp32_torch_frames.shape[-2:], + mode="bilinear", + align_corners=False, + ) + + if raw_output: + return pred_masks.detach().numpy() + + # Create color map and convert segmentation mask to RGB image + pred_mask_img = torch.argmax(pred_masks, 1) + + # Overlay the segmentation mask on the image. alpha=1 is mask only, + # alpha=0 is image only. + color_map = create_color_map(NUM_CLASSES) + out = [] + for i, img_tensor in enumerate(NHWC_int_numpy_frames): + out.append( + Image.blend( + Image.fromarray(img_tensor), + Image.fromarray(color_map[pred_mask_img[i]]), + alpha=0.5, + ) + ) + return out diff --git a/qai_hub_models/models/ddrnet23_slim/demo.py b/qai_hub_models/models/ddrnet23_slim/demo.py new file mode 100644 index 00000000..84129456 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/demo.py @@ -0,0 +1,48 @@ +from qai_hub_models.models.ddrnet23_slim.app import DDRNetApp +from qai_hub_models.models.ddrnet23_slim.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + DDRNet, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +INPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_input_image.png" +) + + +# Run DDRNet end-to-end on a sample image. 
+# The demo will display an image with the predicted segmentation map overlaid. +def main(is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(DDRNet) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=INPUT_IMAGE_ADDRESS, + help="image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + model = demo_model_from_cli_args(DDRNet, args) + validate_on_device_demo_args(args, DDRNet.get_model_id()) + + # Load image & model + image = load_image(args.image) + print("Model Loaded") + + app = DDRNetApp(model) + output = app.segment_image(image)[0] + if not is_test: + display_or_save_image(output, args.output_dir, "ddrnet_demo_output.png") + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ddrnet23_slim/export.py b/qai_hub_models/models/ddrnet23_slim/export.py new file mode 100644 index 00000000..c2e30b7a --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/export.py @@ -0,0 +1,193 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY.
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.ddrnet23_slim import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. 
+ skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "ddrnet23_slim" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "ddrnet23_slim", + "DDRNet23-Slim", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ddrnet23_slim/info.yaml b/qai_hub_models/models/ddrnet23_slim/info.yaml new file mode 100644 index 00000000..0083f858 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/info.yaml @@ -0,0 +1,37 @@ +name: DDRNet23-Slim +# id must match with the model dir name in qai_hub_models +id: ddrnet23_slim +status: public +headline: Segment images or video by class in real-time on device. +domain: Computer Vision +description: DDRNet23Slim is a machine learning model that segments an image into + semantic classes, specifically designed for road-based scenes. It is designed for + the application of self-driving cars. 
+use_case: Semantic Segmentation +tags: +- real-time +research_paper: https://arxiv.org/abs/2101.06085 +research_paper_title: Deep Dual-resolution Networks for Real-time and Accurate Semantic + Segmentation of Road Scenes +license: https://github.com/chenjun2hao/DDRNet.pytorch/blob/main/LICENSE +source_repo: https://github.com/chenjun2hao/DDRNet.pytorch +technical_details: + Number of parameters: 5.70M + Model size: 21.7 MB + Model checkpoint: DDRNet23s_imagenet.pth + Inference latency: RealTime + Input resolution: 2048x1024 +applicable_scenarios: +- Self-driving cars +related_models: +- unet_segmentation +- fcn_resnet50 +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: mit +dataset: + - cityscapes diff --git a/qai_hub_models/models/ddrnet23_slim/model.py b/qai_hub_models/models/ddrnet23_slim/model.py new file mode 100644 index 00000000..c72c8124 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/model.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +from pathlib import Path + +import torch +import torch.nn as nn + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +DDRNET_SOURCE_REPOSITORY = "https://github.com/chenjun2hao/DDRNet.pytorch" +DDRNET_SOURCE_REPO_COMMIT = "bc0e193e87ead839dbc715c48e6bfb059cf21b27" +MODEL_ID = __name__.split(".")[-2] +# Originally from https://drive.google.com/file/d/1d_K3Af5fKHYwxSo8HkxpnhiekhwovmiP/view +DEFAULT_WEIGHTS = "DDRNet23s_imagenet.pth" +MODEL_ASSET_VERSION = 1 +NUM_CLASSES = 19 + + +class DDRNet(BaseModel): + """Exportable DDRNet image segmenter, end-to-end.""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained(cls, checkpoint_path: str | None = None): + """Load DDRNetSlim from a weightfile created by the source DDRNetSlim 
repository.""" + with SourceAsRoot( + DDRNET_SOURCE_REPOSITORY, + DDRNET_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + bad_init_file = Path("lib/models/__init__.py") + if bad_init_file.exists(): + bad_init_file.unlink() + + from lib.models.ddrnet_23_slim import BasicBlock, DualResNet # type: ignore + + ddrnetslim_model = DualResNet( + BasicBlock, + [2, 2, 2, 2], + num_classes=NUM_CLASSES, + planes=32, + spp_planes=128, + head_planes=64, + # No need to use aux loss for inference + augment=False, + ) + + if not checkpoint_path: + checkpoint_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_WEIGHTS + ).fetch() + + pretrained_dict = torch.load( + checkpoint_path, map_location=torch.device("cpu") + ) + if "state_dict" in pretrained_dict: + pretrained_dict = pretrained_dict["state_dict"] + model_dict = ddrnetslim_model.state_dict() + pretrained_dict = { + k[6:]: v + for k, v in pretrained_dict.items() + if k[6:] in model_dict.keys() + } + model_dict.update(pretrained_dict) + ddrnetslim_model.load_state_dict(model_dict) + + ddrnetslim_model.to(torch.device("cpu")).eval() + + return cls(ddrnetslim_model) + + def forward(self, image: torch.Tensor): + """ + Run DDRNet23_Slim on `image`, and produce a predicted segmented image mask. + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: BGR + + Returns: + segmented mask per class: Shape [batch, classes, 128, 256] + """ + return self.model(image) + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 1280, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. Default resolution is 2048x1024 + so this expects an image where width is twice the height. 
+ """ + return {"image": ((batch_size, num_channels, height, width), "float32")} diff --git a/qai_hub_models/models/ddrnet23_slim/perf.yaml b/qai_hub_models/models/ddrnet23_slim/perf.yaml new file mode 100644 index 00000000..1f405dd9 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DDRNet23-Slim + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 6736.0 + throughput: 148.45605700712588 + estimated_peak_memory_range: + min: 991232 + max: 3246040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 131 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 131 + job_id: jvgddqv6g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:12:22.404643Z' diff --git a/qai_hub_models/models/ddrnet23_slim/test.py b/qai_hub_models/models/ddrnet23_slim/test.py new file mode 100644 index 00000000..3acde3c7 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/test.py @@ -0,0 +1,46 
@@ +import numpy as np + +from qai_hub_models.models.ddrnet23_slim.app import DDRNetApp +from qai_hub_models.models.ddrnet23_slim.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.ddrnet23_slim.demo import main as demo_main +from qai_hub_models.models.ddrnet23_slim.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + DDRNet, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_same, skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_output_image.png" +) + + +# Verify that the output from Torch is as expected. +@skip_clone_repo_check +def test_task(): + app = DDRNetApp(DDRNet.from_pretrained()) + original_image = load_image(INPUT_IMAGE_ADDRESS) + output_image = app.segment_image(original_image)[0] + output_image_oracle = load_image(OUTPUT_IMAGE_ADDRESS) + + assert_most_same( + np.asarray(output_image), np.asarray(output_image_oracle), diff_tol=0.01 + ) + + +@skip_clone_repo_check +def test_trace(): + app = DDRNetApp(DDRNet.from_pretrained().convert_to_torchscript()) + original_image = load_image(INPUT_IMAGE_ADDRESS) + output_image = app.segment_image(original_image)[0] + output_image_oracle = load_image(OUTPUT_IMAGE_ADDRESS) + + assert_most_same( + np.asarray(output_image), np.asarray(output_image_oracle), diff_tol=0.01 + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/deeplabv3_resnet50/README.md b/qai_hub_models/models/deeplabv3_resnet50/README.md new file mode 100644 index 00000000..8d4db7eb --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DeepLabV3-ResNet50: Deep Convolutional Neural Network model for semantic 
segmentation](https://aihub.qualcomm.com/models/deeplabv3_resnet50) + +DeepLabV3 is designed for semantic segmentation at multiple scales, trained on the COCO dataset. It uses ResNet50 as a backbone. + +This is based on the implementation of DeepLabV3-ResNet50 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/deeplabv3.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/deeplabv3_resnet50). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.deeplabv3_resnet50.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.deeplabv3_resnet50.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DeepLabV3-ResNet50 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE).
+ + +## References +* [Rethinking Atrous Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1706.05587) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/deeplabv3.py) diff --git a/qai_hub_models/models/deeplabv3_resnet50/__init__.py b/qai_hub_models/models/deeplabv3_resnet50/__init__.py new file mode 100644 index 00000000..e7d6fea1 --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/__init__.py @@ -0,0 +1,4 @@ +from qai_hub_models.models._shared.deeplab.app import DeepLabV3App as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import DeepLabV3_ResNet50 as Model # noqa: F401 diff --git a/qai_hub_models/models/deeplabv3_resnet50/demo.py b/qai_hub_models/models/deeplabv3_resnet50/demo.py new file mode 100644 index 00000000..2d90b8a1 --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/demo.py @@ -0,0 +1,23 @@ +from qai_hub_models.models._shared.deeplab.demo import deeplabv3_demo +from qai_hub_models.models.deeplabv3_resnet50.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + NUM_CLASSES, + DeepLabV3_ResNet50, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +# Demo image comes from https://github.com/pytorch/hub/raw/master/images/deeplab1.png +# and has had alpha channel removed for use as input +INPUT_IMAGE_LOCAL_PATH = "deeplabv3_demo.png" +INPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, INPUT_IMAGE_LOCAL_PATH +) + + +def main(is_test: bool = False): + deeplabv3_demo(DeepLabV3_ResNet50, INPUT_IMAGE_ADDRESS, NUM_CLASSES, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/deeplabv3_resnet50/export.py b/qai_hub_models/models/deeplabv3_resnet50/export.py new file mode 100644 index 00000000..3eec126d --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/export.py @@ -0,0 +1,190 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.deeplabv3_resnet50 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "deeplabv3_resnet50" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "deeplabv3_resnet50", + "DeepLabV3-ResNet50", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0,output_1", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0,output_1", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/deeplabv3_resnet50/info.yaml b/qai_hub_models/models/deeplabv3_resnet50/info.yaml new file mode 100644 index 00000000..993b9dcd --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/info.yaml @@ -0,0 +1,35 @@ +name: DeepLabV3-ResNet50 +# id must match with the model dir name in qai_hub_models +id: deeplabv3_resnet50 +status: public +headline: Deep Convolutional Neural Network model for semantic segmentation. +domain: Computer Vision +use_case: Semantic Segmentation +description: DeepLabV3 is designed for semantic segmentation at multiple scales, trained + on the COCO dataset. It uses ResNet50 as a backbone. 
+tags: [] +research_paper: https://arxiv.org/abs/1706.05587 +research_paper_title: Rethinking Atrous Convolution for Semantic Image Segmentation +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: + https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/deeplabv3.py +technical_details: + Number of parameters: 42.0M + Model size: 151 MB + Model checkpoint: COCO_WITH_VOC_LABELS_V1 + Input resolution: 224x224 +applicable_scenarios: +- Anomaly Detection +- Inventory Management +related_models: +- sam +- unet_segmentation +- fcn_resnet50 +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: [] diff --git a/qai_hub_models/models/deeplabv3_resnet50/model.py b/qai_hub_models/models/deeplabv3_resnet50/model.py new file mode 100644 index 00000000..a585c338 --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/model.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import torch +import torchvision.models as tv_models + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.models._shared.deeplab.evaluator import DeepLabV3Evaluator +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_WEIGHTS = "COCO_WITH_VOC_LABELS_V1" +NUM_CLASSES = 21 + + +class DeepLabV3_ResNet50(BaseModel): + """Exportable DeepLabV3_ResNet50 image segmentation applications, end-to-end.""" + + def __init__( + self, + deeplabv3_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = deeplabv3_model + + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> DeepLabV3_ResNet50: + model = tv_models.segmentation.deeplabv3_resnet50(weights=weights).eval() + return cls(model) + + def get_evaluator(self) -> BaseEvaluator: + return DeepLabV3Evaluator(NUM_CLASSES) + + def forward(self, 
image: torch.Tensor) -> torch.Tensor: + """ + Run DeepLabV3_ResNet50 on `image`, and produce a tensor of classes for segmentation + + Parameters: + image: Pixel values pre-processed for model consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + tensor: Bx21xHxW tensor of class logits per pixel + """ + return self.model(image)["out"] + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 224, + width: int = 224, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + return {"image": ((batch_size, num_channels, height, width), "float32")} diff --git a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml new file mode 100644 index 00000000..eeccef3a --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DeepLabV3-ResNet50 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 57759.0 + throughput: 17.313319136411643 + estimated_peak_memory_range: + min: 12288 + max: 171360368 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 96 + layers_on_cpu: 0 + total_layers: 96 + job_id: jqp4ydxqp + job_status: Passed 
+ torchscript_onnx_qnn: + inference_time: 146022.0 + throughput: 6.848283135417951 + estimated_peak_memory_range: + min: 806912 + max: 9532744 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 82 + layers_on_cpu: 0 + total_layers: 82 + job_id: j0pxl67jp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:31:27.279356Z' diff --git a/qai_hub_models/models/deeplabv3_resnet50/test.py b/qai_hub_models/models/deeplabv3_resnet50/test.py new file mode 100644 index 00000000..d7592338 --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/test.py @@ -0,0 +1,57 @@ +import numpy as np + +from qai_hub_models.models._shared.deeplab.app import DeepLabV3App +from qai_hub_models.models.deeplabv3_resnet50.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.deeplabv3_resnet50.demo import main as demo_main +from qai_hub_models.models.deeplabv3_resnet50.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + NUM_CLASSES, + DeepLabV3_ResNet50, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_IMAGE_LOCAL_PATH = "deeplabv3_demo_output.png" +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, OUTPUT_IMAGE_LOCAL_PATH +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(INPUT_IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = DeepLabV3App(DeepLabV3_ResNet50.from_pretrained(), num_classes=NUM_CLASSES) + app_output_image = app.predict(image, False) + + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_trace(): + image = 
load_image(INPUT_IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = DeepLabV3App( + DeepLabV3_ResNet50.from_pretrained().convert_to_torchscript(), + num_classes=NUM_CLASSES, + ) + app_output_image = app.predict(image, False) + + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/densenet121/README.md b/qai_hub_models/models/densenet121/README.md new file mode 100644 index 00000000..5636d301 --- /dev/null +++ b/qai_hub_models/models/densenet121/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DenseNet-121: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/densenet121) + +Densenet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of DenseNet-121 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/densenet121). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.densenet121.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts.
Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.densenet121.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DenseNet-121 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Densely Connected Convolutional Networks](https://arxiv.org/abs/1608.06993) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py) diff --git a/qai_hub_models/models/densenet121/__init__.py b/qai_hub_models/models/densenet121/__init__.py new file mode 100644 index 00000000..87fce8ae --- /dev/null +++ b/qai_hub_models/models/densenet121/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import DenseNet as Model # noqa: F401 diff --git a/qai_hub_models/models/densenet121/demo.py b/qai_hub_models/models/densenet121/demo.py new file mode 100644 index 00000000..d12e081e --- /dev/null +++ b/qai_hub_models/models/densenet121/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.densenet121.model import DenseNet + + +def main(is_test: bool = False): + imagenet_demo(DenseNet, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/densenet121/export.py 
b/qai_hub_models/models/densenet121/export.py new file mode 100644 index 00000000..c1dbf904 --- /dev/null +++ b/qai_hub_models/models/densenet121/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.densenet121 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. 
+ Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "densenet121" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "densenet121", + "DenseNet-121", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/densenet121/info.yaml b/qai_hub_models/models/densenet121/info.yaml new file mode 100644 index 00000000..c702229f --- /dev/null +++ b/qai_hub_models/models/densenet121/info.yaml @@ -0,0 +1,39 @@ +name: DenseNet-121 +# id must match with the model dir name in qai_hub_models +id: densenet121 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: Densenet is a machine learning model that can classify images from the + Imagenet dataset. It can also be used as a backbone in building more complex models + for specific use cases. 
+use_case: Image Classification +tags: +- backbone +research_paper: https://arxiv.org/abs/1608.06993 +research_paper_title: Densely Connected Convolutional Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py +technical_details: + Number of parameters: 7.98M + Model size: 30.6 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: +- Medical Imaging +- Anomaly Detection +- Inventory Management +related_models: +- mobilenet_v2 +- squeezenet1_1 +- googlenet +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/densenet121/model.py b/qai_hub_models/models/densenet121/model.py new file mode 100644 index 00000000..01a1448f --- /dev/null +++ b/qai_hub_models/models/densenet121/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class DenseNet(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.densenet121(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/densenet121/perf.yaml b/qai_hub_models/models/densenet121/perf.yaml new file mode 100644 index 00000000..ed91b04c --- /dev/null +++ b/qai_hub_models/models/densenet121/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy 
S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DenseNet-121 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1605.0 + throughput: 623.0529595015577 + estimated_peak_memory_range: + min: 28672 + max: 20688688 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 310 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 310 + job_id: jlpe7w275 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1449.0 + throughput: 690.1311249137336 + estimated_peak_memory_range: + min: 73728 + max: 209142552 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: jygzljwz5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:21:38.990133Z' diff --git a/qai_hub_models/models/densenet121/test.py b/qai_hub_models/models/densenet121/test.py new file mode 100644 index 00000000..b2671152 --- /dev/null +++ b/qai_hub_models/models/densenet121/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.densenet121.demo import main as demo_main +from qai_hub_models.models.densenet121.model import MODEL_ID, DenseNet + + +def test_task(): + run_imagenet_classifier_test(DenseNet.from_pretrained(), MODEL_ID) + + +def test_trace(): + run_imagenet_classifier_trace_test(DenseNet.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/detr_resnet101/README.md 
b/qai_hub_models/models/detr_resnet101/README.md new file mode 100644 index 00000000..7151e680 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DETR-ResNet101: Transformer based object detector with ResNet101 backbone](https://aihub.qualcomm.com/models/detr_resnet101) + +DETR is a machine learning model that can detect objects (trained on COCO dataset). + +This is based on the implementation of DETR-ResNet101 found +[here](https://github.com/facebookresearch/detr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/detr_resnet101). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[detr_resnet101]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.detr_resnet101.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.detr_resnet101.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DETR-ResNet101 can be found + [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). + + +## References +* [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) +* [Source Model Implementation](https://github.com/facebookresearch/detr) diff --git a/qai_hub_models/models/detr_resnet101/__init__.py b/qai_hub_models/models/detr_resnet101/__init__.py new file mode 100644 index 00000000..96d0c825 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.detr.app import DETRApp as App # noqa: F401 +from qai_hub_models.models.detr_resnet101.model import ( # noqa: F401 + DETRResNet101 as Model, +) + +from .model import MODEL_ID # noqa: F401 diff --git a/qai_hub_models/models/detr_resnet101/demo.py b/qai_hub_models/models/detr_resnet101/demo.py new file mode 100644 index 00000000..34e0c2c6 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/demo.py @@ -0,0 +1,22 @@ +from qai_hub_models.models._shared.detr.demo import detr_demo +from qai_hub_models.models.detr_resnet101.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + DETRResNet101, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "detr_demo_image.jpg" +) + + +# Run DETR app end-to-end on a sample image. +# The demo will display the predicted mask in a window. 
+def main(is_test: bool = False): + detr_demo(DETRResNet101, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/detr_resnet101/export.py b/qai_hub_models/models/detr_resnet101/export.py new file mode 100644 index 00000000..1eac864d --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.detr_resnet101 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. 
Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "detr_resnet101" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "detr_resnet101", + "DETR-ResNet101", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/detr_resnet101/info.yaml b/qai_hub_models/models/detr_resnet101/info.yaml new file mode 100644 index 00000000..a8c7537a --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/info.yaml @@ -0,0 +1,36 @@ +name: DETR-ResNet101 +# id must match with the model dir name in qai_hub_models +id: detr_resnet101 +status: public +tags: [] +headline: Transformer based object detector with ResNet101 backbone. +domain: Computer Vision +description: DETR is a machine learning model that can detect objects (trained on + COCO dataset). 
+use_case: Object Detection +research_paper: https://arxiv.org/abs/2005.12872 +research_paper_title: End-to-End Object Detection with Transformers +license: https://github.com/facebookresearch/detr/blob/main/LICENSE +source_repo: https://github.com/facebookresearch/detr +technical_details: + Number of parameters: 60.2M + Model size: 230 MB + Model checkpoint: ResNet101 + Input resolution: 480x480 +applicable_scenarios: +- Factory Automation +- Robotic Navigation +- Camera +related_models: +- detr_resnet50 +- detr_resnet50_dc5 +- detr_resnet101_dc5 +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: + - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet101/model.py b/qai_hub_models/models/detr_resnet101/model.py new file mode 100644 index 00000000..2e41e59d --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.detr.model import DETR + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "facebook/detr-resnet-101" +MODEL_ASSET_VERSION = 1 + + +class DETRResNet101(DETR): + """Exportable DETR model, end-to-end.""" + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + return DETR.from_pretrained(ckpt_name) diff --git a/qai_hub_models/models/detr_resnet101/perf.yaml b/qai_hub_models/models/detr_resnet101/perf.yaml new file mode 100644 index 00000000..86375675 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung 
Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DETR-ResNet101 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 640294.0 + throughput: 1.5617825561382739 + estimated_peak_memory_range: + min: 107266048 + max: 111542968 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 957 + total_layers: 957 + job_id: jz5wl39zp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:15:32.226652Z' diff --git a/qai_hub_models/models/detr_resnet101/requirements.txt b/qai_hub_models/models/detr_resnet101/requirements.txt new file mode 100644 index 00000000..3582ec2c --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.31.0 +timm==0.9.7 diff --git a/qai_hub_models/models/detr_resnet101/test.py b/qai_hub_models/models/detr_resnet101/test.py new file mode 100644 index 00000000..0a1c7e40 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/test.py @@ -0,0 +1,27 @@ +from qai_hub_models.models._shared.detr.app import DETRApp +from qai_hub_models.models.detr_resnet101.demo import MODEL_ASSET_VERSION, MODEL_ID +from qai_hub_models.models.detr_resnet101.demo import main as demo_main +from qai_hub_models.models.detr_resnet101.model import DEFAULT_WEIGHTS, DETRResNet101 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + +IMAGE_ADDRESS = 
CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "detr_test_image.jpg" +) + + +def test_task(): + net = DETRResNet101.from_pretrained(DEFAULT_WEIGHTS) + img = load_image(IMAGE_ADDRESS) + _, _, label, _ = DETRApp(net).predict(img, DEFAULT_WEIGHTS) + assert set(list(label.numpy())) == {75, 63, 17} + + +def test_trace(): + net = DETRResNet101.from_pretrained(DEFAULT_WEIGHTS).convert_to_torchscript() + img = load_image(IMAGE_ADDRESS) + _, _, label, _ = DETRApp(net).predict(img, DEFAULT_WEIGHTS) + assert set(list(label.numpy())) == {75, 63, 17} + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/detr_resnet101_dc5/README.md b/qai_hub_models/models/detr_resnet101_dc5/README.md new file mode 100644 index 00000000..bc12519e --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DETR-ResNet101-DC5: Transformer based object detector with ResNet101 backbone (dilated C5 stage)](https://aihub.qualcomm.com/models/detr_resnet101_dc5) + +DETR is a machine learning model that can detect objects (trained on COCO dataset). + +This is based on the implementation of DETR-ResNet101-DC5 found +[here](https://github.com/facebookresearch/detr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/detr_resnet101_dc5). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device.
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[detr_resnet101_dc5]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.detr_resnet101_dc5.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.detr_resnet101_dc5.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DETR-ResNet101-DC5 can be found + [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). 
+ + +## References +* [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) +* [Source Model Implementation](https://github.com/facebookresearch/detr) diff --git a/qai_hub_models/models/detr_resnet101_dc5/__init__.py b/qai_hub_models/models/detr_resnet101_dc5/__init__.py new file mode 100644 index 00000000..30449941 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.detr.app import DETRApp as App # noqa: F401 +from qai_hub_models.models.detr_resnet101_dc5.model import ( # noqa: F401 + DETRResNet101DC5 as Model, +) + +from .model import MODEL_ID # noqa: F401 diff --git a/qai_hub_models/models/detr_resnet101_dc5/demo.py b/qai_hub_models/models/detr_resnet101_dc5/demo.py new file mode 100644 index 00000000..fcaa3b48 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/demo.py @@ -0,0 +1,22 @@ +from qai_hub_models.models._shared.detr.demo import detr_demo +from qai_hub_models.models.detr_resnet101_dc5.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + DETRResNet101DC5, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "detr_demo_image.jpg" +) + + +# Run DETR app end-to-end on a sample image. +# The demo will display the predicted mask in a window. +def main(is_test: bool = False): + detr_demo(DETRResNet101DC5, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/detr_resnet101_dc5/export.py b/qai_hub_models/models/detr_resnet101_dc5/export.py new file mode 100644 index 00000000..2068b0f7 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.detr_resnet101_dc5 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/detr_resnet101_dc5` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "detr_resnet101_dc5" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "detr_resnet101_dc5", + "DETR-ResNet101-DC5", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2.
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/detr_resnet101_dc5/info.yaml b/qai_hub_models/models/detr_resnet101_dc5/info.yaml new file mode 100644 index 00000000..e59343af --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/info.yaml @@ -0,0 +1,36 @@ +name: DETR-ResNet101-DC5 +# id must match with the model dir name in qai_hub_models +id: detr_resnet101_dc5 +status: public +tags: [] +headline: Transformer based object detector with ResNet101 backbone (dilated C5 stage). +domain: Computer Vision +description: DETR is a machine learning model that can detect objects (trained on + COCO dataset). 
+use_case: Object Detection +research_paper: https://arxiv.org/abs/2005.12872 +research_paper_title: End-to-End Object Detection with Transformers +license: https://github.com/facebookresearch/detr/blob/main/LICENSE +source_repo: https://github.com/facebookresearch/detr +technical_details: + Number of parameters: 60.2M + Model size: 231 MB + Model checkpoint: ResNet101-DC5 + Input resolution: 480x480 +applicable_scenarios: +- Factory Automation +- Robotic Navigation +- Camera +related_models: +- detr_resnet50 +- detr_resnet50_dc5 +- detr_resnet101 +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: + - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet101_dc5/model.py b/qai_hub_models/models/detr_resnet101_dc5/model.py new file mode 100644 index 00000000..c12ed734 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.detr.model import DETR + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "facebook/detr-resnet-101-dc5" +MODEL_ASSET_VERSION = 1 + + +class DETRResNet101DC5(DETR): + """Exportable DETR model, end-to-end.""" + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + return DETR.from_pretrained(ckpt_name) diff --git a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml new file mode 100644 index 00000000..3412b2d5 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung 
Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DETR-ResNet101-DC5 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 971988.0 + throughput: 1.0288192858348046 + estimated_peak_memory_range: + min: 12288 + max: 291526464 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 958 + total_layers: 958 + job_id: jlpe7w875 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:40:02.166898Z' diff --git a/qai_hub_models/models/detr_resnet101_dc5/requirements.txt b/qai_hub_models/models/detr_resnet101_dc5/requirements.txt new file mode 100644 index 00000000..3582ec2c --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.31.0 +timm==0.9.7 diff --git a/qai_hub_models/models/detr_resnet101_dc5/test.py b/qai_hub_models/models/detr_resnet101_dc5/test.py new file mode 100644 index 00000000..4cbb0e44 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/test.py @@ -0,0 +1,32 @@ +from qai_hub_models.models._shared.detr.app import DETRApp +from qai_hub_models.models.detr_resnet101_dc5.demo import IMAGE_ADDRESS +from qai_hub_models.models.detr_resnet101_dc5.demo import main as demo_main +from qai_hub_models.models.detr_resnet101_dc5.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + DETRResNet101DC5, +) +from 
qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "detr_test_image.jpg" +) + + +def test_task(): + net = DETRResNet101DC5.from_pretrained(DEFAULT_WEIGHTS) + img = load_image(IMAGE_ADDRESS) + _, _, label, _ = DETRApp(net).predict(img, DEFAULT_WEIGHTS) + assert set(list(label.numpy())) == {75, 63, 17} + + +def test_trace(): + net = DETRResNet101DC5.from_pretrained(DEFAULT_WEIGHTS).convert_to_torchscript() + img = load_image(IMAGE_ADDRESS) + _, _, label, _ = DETRApp(net).predict(img, DEFAULT_WEIGHTS) + assert set(list(label.numpy())) == {75, 63, 17} + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/detr_resnet50/README.md b/qai_hub_models/models/detr_resnet50/README.md new file mode 100644 index 00000000..7a9910a6 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DETR-ResNet50: Transformer based object detector with ResNet50 backbone](https://aihub.qualcomm.com/models/detr_resnet50) + +DETR is a machine learning model that can detect objects (trained on COCO dataset). + +This is based on the implementation of DETR-ResNet50 found +[here](https://github.com/facebookresearch/detr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/detr_resnet50). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device.
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[detr_resnet50]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.detr_resnet50.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.detr_resnet50.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DETR-ResNet50 can be found + [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). 
+ + +## References +* [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) +* [Source Model Implementation](https://github.com/facebookresearch/detr) diff --git a/qai_hub_models/models/detr_resnet50/__init__.py b/qai_hub_models/models/detr_resnet50/__init__.py new file mode 100644 index 00000000..5380172b --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.detr.app import DETRApp as App # noqa: F401 +from qai_hub_models.models.detr_resnet50.model import ( # noqa: F401 + DETRResNet50 as Model, +) + +from .model import MODEL_ID # noqa: F401 diff --git a/qai_hub_models/models/detr_resnet50/demo.py b/qai_hub_models/models/detr_resnet50/demo.py new file mode 100644 index 00000000..36cf7f70 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/demo.py @@ -0,0 +1,22 @@ +from qai_hub_models.models._shared.detr.demo import detr_demo +from qai_hub_models.models.detr_resnet50.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + DETRResNet50, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "detr_demo_image.jpg" +) + + +# Run DETR app end-to-end on a sample image. +# The demo will display the predicted mask in a window. +def main(is_test: bool = False): + detr_demo(DETRResNet50, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/detr_resnet50/export.py b/qai_hub_models/models/detr_resnet50/export.py new file mode 100644 index 00000000..d3f93926 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.detr_resnet50 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/detr_resnet50` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "detr_resnet50" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "detr_resnet50", + "DETR-ResNet50", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2.
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/detr_resnet50/info.yaml b/qai_hub_models/models/detr_resnet50/info.yaml new file mode 100644 index 00000000..9cf94087 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/info.yaml @@ -0,0 +1,36 @@ +name: DETR-ResNet50 +# id must match with the model dir name in qai_hub_models +id: detr_resnet50 +status: public +tags: [] +headline: Transformer based object detector with ResNet50 backbone. +domain: Computer Vision +description: DETR is a machine learning model that can detect objects (trained on + COCO dataset). 
+use_case: Object Detection +research_paper: https://arxiv.org/abs/2005.12872 +research_paper_title: End-to-End Object Detection with Transformers +license: https://github.com/facebookresearch/detr/blob/main/LICENSE +source_repo: https://github.com/facebookresearch/detr +technical_details: + Number of parameters: 41.3M + Model size: 158 MB + Model checkpoint: ResNet50 + Input resolution: 480x480 +applicable_scenarios: +- Factory Automation +- Robotic Navigation +- Camera +related_models: +- detr_resnet50_dc5 +- detr_resnet101_dc5 +- detr_resnet101 +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: + - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet50/model.py b/qai_hub_models/models/detr_resnet50/model.py new file mode 100644 index 00000000..601967e6 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.detr.model import DETR + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "facebook/detr-resnet-50" +MODEL_ASSET_VERSION = 1 + + +class DETRResNet50(DETR): + """Exportable DETR model, end-to-end.""" + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + return DETR.from_pretrained(ckpt_name) diff --git a/qai_hub_models/models/detr_resnet50/perf.yaml b/qai_hub_models/models/detr_resnet50/perf.yaml new file mode 100644 index 00000000..a0f2be6b --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy 
S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DETR-ResNet50 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 346284.0 + throughput: 2.887803074932714 + estimated_peak_memory_range: + min: 109121536 + max: 112011896 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 889 + total_layers: 889 + job_id: jvgddqrkg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:35:06.340774Z' diff --git a/qai_hub_models/models/detr_resnet50/requirements.txt b/qai_hub_models/models/detr_resnet50/requirements.txt new file mode 100644 index 00000000..3582ec2c --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.31.0 +timm==0.9.7 diff --git a/qai_hub_models/models/detr_resnet50/test.py b/qai_hub_models/models/detr_resnet50/test.py new file mode 100644 index 00000000..90fc89ee --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/test.py @@ -0,0 +1,43 @@ +from qai_hub_models.models._shared.detr.app import DETRApp +from qai_hub_models.models.detr_resnet50.demo import main as demo_main +from qai_hub_models.models.detr_resnet50.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + DETRResNet50, +) +from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + 
+EXPECTED_OUTPUT = {75, 63, 17} + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "detr_test_image.jpg" +) + + +def test_task(): + net = DETRResNet50.from_pretrained() + img = load_image(IMAGE_ADDRESS) + _, _, label, _ = DETRApp(net).predict(img, DEFAULT_WEIGHTS) + assert set(list(label.numpy())) == EXPECTED_OUTPUT + + +def test_cli_from_pretrained(): + args = get_model_cli_parser(DETRResNet50).parse_args([]) + assert model_from_cli_args(DETRResNet50, args) is not None + + +def test_trace(): + net = DETRResNet50.from_pretrained() + input_spec = net.get_input_spec() + trace = net.convert_to_torchscript(input_spec) + + img = load_image(IMAGE_ADDRESS) + _, _, label, _ = DETRApp(trace).predict(img, DEFAULT_WEIGHTS) + assert set(list(label.numpy())) == EXPECTED_OUTPUT + + +def test_demo(): + # Run demo and verify it does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/detr_resnet50_dc5/README.md b/qai_hub_models/models/detr_resnet50_dc5/README.md new file mode 100644 index 00000000..93283e40 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50_dc5/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [DETR-ResNet50-DC5: Transformer based object detector with ResNet50 backbone (dilated C5 stage)](https://aihub.qualcomm.com/models/detr_resnet50_dc5) + +DETR is a machine learning model that can detect objects (trained on COCO dataset). + +This is based on the implementation of DETR-ResNet50-DC5 found +[here](https://github.com/facebookresearch/detr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/detr_resnet50_dc5). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device.
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[detr_resnet50_dc5]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.detr_resnet50_dc5.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.detr_resnet50_dc5.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of DETR-ResNet50-DC5 can be found + [here](https://github.com/facebookresearch/detr/blob/main/LICENSE).
# Run DETR app end-to-end on a sample image.
# NOTE(review): the original comment said the demo displays the predicted mask
# in a window; what is actually shown is decided inside detr_demo, which is
# not visible from this module -- confirm.
def main(is_test: bool = False) -> None:
    """
    Entry point for the DETR-ResNet50-DC5 demo.

    Delegates to the shared `detr_demo` helper, passing this package's model
    class, default checkpoint name and demo image asset.

    Parameters:
        is_test: Forwarded unchanged to `detr_demo`. The package's test module
            calls `main(is_test=True)`.
    """
    detr_demo(DETRResNet50DC5, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export DETR-ResNet50-DC5 through Qualcomm AI Hub.

    This function accomplishes up to 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    If `can_access_qualcomm_ai_hub()` returns false, none of the steps above
    run here: the call is delegated to `export_without_hub_access` and its
    result is returned directly.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/detr_resnet50_dc5`.
        dst_runtime: Name of the on-device runtime to target, parsed with
            `parse_target_runtime`. Default is "TFLITE".
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        When AI Hub is not accessible, the return value of
        `export_without_hub_access` instead (annotated above as a list of strings).
    """
    model_name = "detr_resnet50_dc5"
    # `or` binds looser than `/`: an explicit output_dir is used verbatim,
    # otherwise the default is <cwd>/build/<model_name>.
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        # No hub access: hand every option to the fallback helper and return
        # whatever it produces.
        return export_without_hub_access(
            "detr_resnet50_dc5",
            "DETR-ResNet50-DC5",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    # get_model_kwargs / get_input_spec_kwargs pick the relevant entries out of
    # the extra keyword arguments for each call.
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flag for the "image" input is always appended to the
    # caller's compile options.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is only bound on this branch; step 6 reads it under the
        # same `not skip_inferencing` condition.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", independent of
        # dst_runtime -- confirm this is intended for every supported runtime.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # NOTE(review): the wait() call lives inside an assert, so it is not
        # executed when Python runs with -O.
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Reference outputs computed locally with the PyTorch model, for
        # comparison against the on-device outputs.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    # Printed unconditionally, even when skip_summary is set.
    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class DETRResNet50DC5(DETR):
    """Exportable DETR model, end-to-end."""

    @classmethod
    def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS):
        """
        Build the model from a pretrained checkpoint.

        Parameters:
            ckpt_name: Checkpoint identifier forwarded to the inherited
                `from_pretrained`. Defaults to DEFAULT_WEIGHTS
                ("facebook/detr-resnet-50-dc5").

        Returns:
            Whatever the inherited `from_pretrained` constructs when invoked
            on this class.
        """
        # Delegate through super() instead of naming DETR explicitly, so the
        # inherited constructor receives this class as `cls` rather than the
        # base class (the explicit DETR.from_pretrained call discarded `cls`).
        return super().from_pretrained(ckpt_name)
S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: DETR-ResNet50-DC5 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 760148.0 + throughput: 1.3155332908854591 + estimated_peak_memory_range: + min: 251318272 + max: 254954864 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 890 + total_layers: 890 + job_id: j1pvlr7m5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:15:27.657498Z' diff --git a/qai_hub_models/models/detr_resnet50_dc5/requirements.txt b/qai_hub_models/models/detr_resnet50_dc5/requirements.txt new file mode 100644 index 00000000..3582ec2c --- /dev/null +++ b/qai_hub_models/models/detr_resnet50_dc5/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.31.0 +timm==0.9.7 diff --git a/qai_hub_models/models/detr_resnet50_dc5/test.py b/qai_hub_models/models/detr_resnet50_dc5/test.py new file mode 100644 index 00000000..afe11871 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50_dc5/test.py @@ -0,0 +1,30 @@ +from qai_hub_models.models._shared.detr.app import DETRApp +from qai_hub_models.models.detr_resnet50_dc5.demo import MODEL_ASSET_VERSION, MODEL_ID +from qai_hub_models.models.detr_resnet50_dc5.demo import main as demo_main +from qai_hub_models.models.detr_resnet50_dc5.model import ( + DEFAULT_WEIGHTS, + DETRResNet50DC5, +) +from qai_hub_models.utils.asset_loaders import 
IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store(
    MODEL_ID, MODEL_ASSET_VERSION, "detr_test_image.jpg"
)


def _detected_labels(model):
    """Run the DETR app with `model` on the test image; return the set of labels."""
    test_image = load_image(IMAGE_ADDRESS)
    # predict() is unpacked into exactly four values; only the third is checked.
    _, _, labels, _ = DETRApp(model).predict(test_image, DEFAULT_WEIGHTS)
    return set(labels.numpy().tolist())


def test_task():
    """The eager PyTorch model yields exactly the expected label set."""
    eager_model = DETRResNet50DC5.from_pretrained(DEFAULT_WEIGHTS)
    assert _detected_labels(eager_model) == {75, 63, 17}


def test_trace():
    """The traced TorchScript model yields the same label set as eager mode."""
    eager_model = DETRResNet50DC5.from_pretrained(DEFAULT_WEIGHTS)
    traced_model = eager_model.convert_to_torchscript()
    assert _detected_labels(traced_model) == {75, 63, 17}


def test_demo():
    """Smoke test: the demo entry point runs in test mode without raising."""
    demo_main(is_test=True)
+ + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.efficientnet_b0.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.efficientnet_b0.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of EfficientNet-B0 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE).
def main(is_test: bool = False) -> None:
    """
    Entry point for the EfficientNet-B0 demo.

    Delegates to the shared `imagenet_demo` helper with the `EfficientNetB0`
    model class.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo`. The package's test
            module calls `main(is_test=True)`.
    """
    imagenet_demo(EfficientNetB0, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export EfficientNet-B0 through Qualcomm AI Hub.

    This function accomplishes up to 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    If `can_access_qualcomm_ai_hub()` returns false, none of the steps above
    run here: the call is delegated to `export_without_hub_access` and its
    result is returned directly.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/efficientnet_b0`.
        dst_runtime: Name of the on-device runtime to target, parsed with
            `parse_target_runtime`. Default is "TFLITE".
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        When AI Hub is not accessible, the return value of
        `export_without_hub_access` instead (annotated above as a list of strings).
    """
    model_name = "efficientnet_b0"
    # `or` binds looser than `/`: an explicit output_dir is used verbatim,
    # otherwise the default is <cwd>/build/<model_name>.
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        # No hub access: hand every option to the fallback helper and return
        # whatever it produces.
        return export_without_hub_access(
            "efficientnet_b0",
            "EfficientNet-B0",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    # get_model_kwargs / get_input_spec_kwargs pick the relevant entries out of
    # the extra keyword arguments for each call.
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flag for the "image_tensor" input is always appended to
    # the caller's compile options.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is only bound on this branch; step 6 reads it under the
        # same `not skip_inferencing` condition.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", independent of
        # dst_runtime -- confirm this is intended for every supported runtime.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # NOTE(review): the wait() call lives inside an assert, so it is not
        # executed when Python runs with -O.
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Reference outputs computed locally with the PyTorch model, for
        # comparison against the on-device outputs.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    # Printed unconditionally, even when skip_summary is set.
    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class EfficientNetB0(ImagenetClassifier):
    """ImagenetClassifier wrapper around torchvision's `efficientnet_b0`."""

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier:
        """
        Construct the classifier from torchvision weights.

        Parameters:
            weights: Value passed as `weights=` to
                `torchvision.models.efficientnet_b0`. Defaults to
                DEFAULT_WEIGHTS ("IMAGENET1K_V1").

        Returns:
            An instance of `cls` wrapping the torchvision network.
        """
        return cls(tv_models.efficientnet_b0(weights=weights))
def test_task():
    """Run the shared ImageNet classifier check on a pretrained EfficientNet-B0."""
    # MODEL_ID is passed through to the shared helper alongside the model.
    run_imagenet_classifier_test(EfficientNetB0.from_pretrained(), MODEL_ID)


def test_trace():
    """Run the shared ImageNet classifier trace check on a pretrained EfficientNet-B0."""
    run_imagenet_classifier_trace_test(EfficientNetB0.from_pretrained())
crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/esrgan/README.md b/qai_hub_models/models/esrgan/README.md new file mode 100644 index 00000000..99d8588d --- /dev/null +++ b/qai_hub_models/models/esrgan/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ESRGAN: Upscale images and remove image noise](https://aihub.qualcomm.com/models/esrgan) + +ESRGAN is a machine learning model that upscales an image with minimal loss in quality. + +This is based on the implementation of ESRGAN found +[here](https://github.com/xinntao/ESRGAN/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/esrgan). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.esrgan.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.esrgan.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root.
+- The license for the original implementation of ESRGAN can be found + [here](https://github.com/xinntao/ESRGAN/blob/master/LICENSE). + + +## References +* [ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks](https://arxiv.org/abs/1809.00219) +* [Source Model Implementation](https://github.com/xinntao/ESRGAN/) diff --git a/qai_hub_models/models/esrgan/__init__.py b/qai_hub_models/models/esrgan/__init__.py new file mode 100644 index 00000000..cdf02df6 --- /dev/null +++ b/qai_hub_models/models/esrgan/__init__.py @@ -0,0 +1,3 @@ +from .app import ESRGANApp as App # noqa: F401 +from .model import ESRGAN as Model # noqa: F401 +from .model import MODEL_ID # noqa: F401 diff --git a/qai_hub_models/models/esrgan/app.py b/qai_hub_models/models/esrgan/app.py new file mode 100644 index 00000000..8807eaf9 --- /dev/null +++ b/qai_hub_models/models/esrgan/app.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import List + +import numpy as np +import torch +import torchvision.transforms as transforms +from PIL.Image import Image, fromarray + + +class ESRGANApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with ESRGAN. + + The app uses 1 model: + * ESRGAN + + For a given image input, the app will: + * pre-process the image (convert to range[0, 1]) + * Run ESRGAN inference + * post-process the image + * display the input and output side-by-side + """ + + def __init__(self, esrgan_model): + self.model = esrgan_model + + def predict(self, *args, **kwargs): + # See upscale_image. + return self.upscale_image(*args, **kwargs) + + def upscale_image( + self, + pixel_values_or_image: torch.Tensor | Image | List[Image], + ) -> Image: + """ + Upscale provided images + + Parameters: + pixel_values_or_image: torch.Tensor + Input PIL image (before pre-processing) or pyTorch tensor (after image pre-processing). 
+ + Returns: + image: PIL.Image.Image + The upscaled image. + """ + + # preprocess + pixel_values = preprocess_image(pixel_values_or_image) + + # Run prediction + upscaled_image = self.model(pixel_values) + + # post-process + output_image = postprocess_image(upscaled_image) + + return output_image + + +def preprocess_image(image: Image) -> torch.Tensor: + """ + Convert a raw PIL image into a normalised PyTorch tensor + that can be used as input to ESRGAN inference. + """ + transform = transforms.Compose([transforms.PILToTensor()]) # PIL image -> channel-first tensor (values not rescaled) + img: torch.Tensor = transform(image) # type: ignore + img = img.float() / 255.0 # int 0 - 255 to float 0.0 - 1.0 + if img.ndimension() == 3: + img = img.unsqueeze(0) + return img + + +def postprocess_image(image: Image) -> Image: + """ + Convert from range[0, 1] to uint8 values for display. + """ + output_img = np.squeeze(image) + output_img = output_img.detach().numpy().astype(float) + output_img = np.transpose(output_img, (1, 2, 0)) + output_img = np.clip(output_img * 255.0, 0, 255) + output_img = output_img.round().astype(np.uint8) + output = fromarray(output_img) + + return output diff --git a/qai_hub_models/models/esrgan/demo.py b/qai_hub_models/models/esrgan/demo.py new file mode 100644 index 00000000..96a40627 --- /dev/null +++ b/qai_hub_models/models/esrgan/demo.py @@ -0,0 +1,43 @@ +import argparse + +from qai_hub_models.models.esrgan.app import ESRGANApp +from qai_hub_models.models.esrgan.model import ESRGAN, MODEL_ASSET_VERSION, MODEL_ID +from qai_hub_models.utils.args import add_output_dir_arg +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "esrgan_demo.jpg" +) + + +# Run ESRGAN end-to-end on a sample image. +# The demo will display an image upscaled with minimal loss in quality.
+def main(is_test: bool = False): + # Demo parameters + parser = argparse.ArgumentParser() + parser.add_argument( + "--image", + type=str, + default=IMAGE_ADDRESS, + help="image file path or URL.", + ) + add_output_dir_arg(parser) + + args = parser.parse_args([] if is_test else None) + + # Load image & model + app = ESRGANApp(ESRGAN.from_pretrained()) + image = load_image(args.image) + pred_image = app.upscale_image(image) + if not is_test: + display_or_save_image( + image, args.output_dir, "original_image.png", "original image" + ) + display_or_save_image( + pred_image, args.output_dir, "upscaled_image.png", "upscaled image" + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/esrgan/export.py b/qai_hub_models/models/esrgan/export.py new file mode 100644 index 00000000..0237f2af --- /dev/null +++ b/qai_hub_models/models/esrgan/export.py @@ -0,0 +1,190 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.esrgan import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: 
str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/esrgan` inside the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
+ """ + model_name = "esrgan" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "esrgan", + "ESRGAN", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." 
+ ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/esrgan/info.yaml b/qai_hub_models/models/esrgan/info.yaml new file mode 100644 index 00000000..2dc08ffd --- /dev/null +++ b/qai_hub_models/models/esrgan/info.yaml @@ -0,0 +1,33 @@ +name: ESRGAN +# id must match with the model dir name in qai_hub_models +id: esrgan +status: public +headline: Upscale images and remove image noise. 
+domain: Computer Vision +description: ESRGAN is a machine learning model that upscales an image with minimal + loss in quality. +use_case: Super Resolution +tags: [] +research_paper: https://arxiv.org/abs/1809.00219 +research_paper_title: 'ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks' +license: https://github.com/xinntao/ESRGAN/blob/master/LICENSE +source_repo: https://github.com/xinntao/ESRGAN/ +technical_details: + Number of parameters: 16.7M + Model size: 64.0 MB + Model checkpoint: ESRGAN_x4 + Input resolution: 128x128 +applicable_scenarios: +- Virtual Real Estate Tours +- Gaming +- ARVR +form_factors: +- Phone +- Tablet +related_models: +- real_esrgan_general_x4v3 +- real_esrgan_x4plus +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: [] diff --git a/qai_hub_models/models/esrgan/model.py b/qai_hub_models/models/esrgan/model.py new file mode 100644 index 00000000..880eb4cd --- /dev/null +++ b/qai_hub_models/models/esrgan/model.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.superres_evaluator import SuperResolutionOutputEvaluator +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +ESRGAN_SOURCE_REPOSITORY = "https://github.com/xinntao/ESRGAN" +ESRGAN_SOURCE_REPO_COMMIT = "73e9b634cf987f5996ac2dd33f4050922398a921" +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_WEIGHTS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "RRDB_ESRGAN_x4.pth" +) +SCALING_FACTOR = 4 + + +class ESRGAN(BaseModel): + """Exportable ESRGAN super resolution applications, end-to-end.""" + + def __init__( + self, + esrgan_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = esrgan_model + + @classmethod + 
def from_pretrained(cls, weights_path: str | None = None) -> ESRGAN: + """Load ESRGAN from a weightfile created by the source ESRGAN repository.""" + + # Load PyTorch model from disk + esrgan_model = _load_esrgan_source_model_from_weights(weights_path) + + return cls(esrgan_model) + + def get_evaluator(self) -> BaseEvaluator: + return SuperResolutionOutputEvaluator() + + def forward(self, image: torch.Tensor) -> torch.Tensor: + """ + Run ESRGAN on `image`, and produce an upscaled image + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + image: Pixel values + Range: float[0, 1] + 3-channel Color Space: RGB + """ + return self.model(image) + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 128, + width: int = 128, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def _load_esrgan_source_model_from_weights( + weights_path: str | None = None, +) -> torch.nn.Module: + # Load ESRGAN model from the source repository using the given weights. + with SourceAsRoot( + ESRGAN_SOURCE_REPOSITORY, + ESRGAN_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + # download the weights file + if not weights_path: + weights_path = DEFAULT_WEIGHTS.fetch() + print(f"Weights file downloaded as {weights_path}") + + # necessary import. `esrgan.RRDBNet_arch` comes from the esrgan repo. 
+ import RRDBNet_arch as arch + + esrgan_model = arch.RRDBNet(3, 3, 64, 23, gc=32) + esrgan_model.load_state_dict( + torch.load(weights_path, map_location=torch.device("cpu")), strict=True + ) + return esrgan_model diff --git a/qai_hub_models/models/esrgan/perf.yaml b/qai_hub_models/models/esrgan/perf.yaml new file mode 100644 index 00000000..50e0e2bd --- /dev/null +++ b/qai_hub_models/models/esrgan/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ESRGAN + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 76337.0 + throughput: 13.099807432830737 + estimated_peak_memory_range: + min: 3301376 + max: 6221192 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1024 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1024 + job_id: jnp1nw7kg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 66070.0 + throughput: 15.135462388375965 + estimated_peak_memory_range: + min: 102400 + max: 101973424 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1027 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1027 + job_id: jvgddq8kg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:29:43.305116Z' diff --git a/qai_hub_models/models/esrgan/test.py 
b/qai_hub_models/models/esrgan/test.py new file mode 100644 index 00000000..56577987 --- /dev/null +++ b/qai_hub_models/models/esrgan/test.py @@ -0,0 +1,45 @@ +import numpy as np + +from qai_hub_models.models.esrgan.app import ESRGANApp +from qai_hub_models.models.esrgan.demo import IMAGE_ADDRESS +from qai_hub_models.models.esrgan.demo import main as demo_main +from qai_hub_models.models.esrgan.model import ESRGAN, MODEL_ASSET_VERSION, MODEL_ID +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "esrgan_demo_output.png" +) + + +@skip_clone_repo_check +def test_esrgan_app(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = ESRGANApp(ESRGAN.from_pretrained()) + app_output_image = app.upscale_image(image) + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_esrgan_trace(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = ESRGANApp(ESRGAN.from_pretrained().convert_to_torchscript()) + app_output_image = app.upscale_image(image) + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/facebook_denoiser/README.md b/qai_hub_models/models/facebook_denoiser/README.md new file mode 100644 index 00000000..b92ce796 --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# 
[Facebook-Denoiser: Real-time speech denoising optimized for mobile and edge](https://aihub.qualcomm.com/models/facebook_denoiser) + +Facebook Denoiser is a machine learning model that can denoise & isolate voices in sound clips. + +This is based on the implementation of Facebook-Denoiser found +[here](https://github.com/facebookresearch/denoiser). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/facebook_denoiser). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[facebook_denoiser]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.facebook_denoiser.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.facebook_denoiser.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Facebook-Denoiser can be found + [here](https://github.com/facebookresearch/denoiser/blob/main/LICENSE).
+ + +## References +* [Real Time Speech Enhancement in the Waveform Domain](https://arxiv.org/abs/2006.12847) +* [Source Model Implementation](https://github.com/facebookresearch/denoiser) diff --git a/qai_hub_models/models/facebook_denoiser/__init__.py b/qai_hub_models/models/facebook_denoiser/__init__.py new file mode 100644 index 00000000..8fe05b62 --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/__init__.py @@ -0,0 +1,3 @@ +from .app import FacebookDenoiserApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import FacebookDenoiser as Model # noqa: F401 diff --git a/qai_hub_models/models/facebook_denoiser/app.py b/qai_hub_models/models/facebook_denoiser/app.py new file mode 100644 index 00000000..bb5390d3 --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/app.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Callable, List, Sequence + +import numpy as np +import torch +import torchaudio + +from qai_hub_models.models.facebook_denoiser.model import SAMPLE_RATE + + +class FacebookDenoiserApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with Facebook Denoiser. + + For a given audio input, the app will: + * load the audio from the source wav file + * call the denoiser + * save the denoised audio back to a different wav file + """ + + def __init__( + self, + denoiser: Callable[[torch.Tensor], torch.Tensor], + sample_rate: int = SAMPLE_RATE, + ): + self.denoiser = denoiser + self.sample_rate = sample_rate + + def predict(self, *args, **kwargs): + """See FacebookDenoiserApp::denoise_audio for interface documentation.""" + return self.denoise_audio(*args, **kwargs) + + def denoise_audio( + self, + input_audio: Sequence[Path | str | torch.Tensor | np.ndarray], + out_dir: Path | str | None = None, + ) -> List[Path | torch.Tensor]: + """ + Denoise and isolate the speech in the provided audio clip(s). 
+ + Parameters: + input_audio: List[Path | str | torch.Tensor | np.ndarray] + A list of paths (to .wav files), or loaded audio in torch Tensor / numpy format. + Tensors must be shape [2, sample_rate * length of recording in seconds]. + All audio must have the same sample rate the model was trained on. + + out_dir: Path | str | None + If: + * this is set to a folder, AND + * all of input_audio are file paths + Then a list of saved .wav file paths will be returned. + + Otherwise, the method will return a list of predicted WAV audio tensors. + + Returns: + Predicted audio. See `out_dir` parameter above for type of return value. + """ + with torch.no_grad(): + all_inputs_are_paths = True + + noisy_audios = [] + for audio in input_audio: + if isinstance(audio, str) or isinstance(audio, Path): + audio, sample_rate = torchaudio.load(audio) + assert sample_rate == self.sample_rate + else: + all_inputs_are_paths = False + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + noisy_audios.append(audio) + + estimates = [] + for noisy in noisy_audios: + out = self.denoiser(noisy) + out = out / max(out.abs().max().item(), 1) # Normalize + if all_inputs_are_paths and out_dir: + # We don't run files in batches, take the first batch output + out = out[:, 0] + estimates.append(out) + + if out_dir and all_inputs_are_paths: + output_files = [] + for path, estimate in zip(input_audio, estimates): + filename = os.path.join( + out_dir, os.path.basename(path).rsplit(".", 1)[0] + ) + filename = Path(f"{filename}_enhanced.wav") + torchaudio.save(filename, estimate, self.sample_rate) + output_files.append(filename) + return output_files + return estimates diff --git a/qai_hub_models/models/facebook_denoiser/demo.py b/qai_hub_models/models/facebook_denoiser/demo.py new file mode 100644 index 00000000..b5139cae --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/demo.py @@ -0,0 +1,65 @@ +import os +import tempfile +from typing import List + +from
qai_hub_models.models.facebook_denoiser.app import FacebookDenoiserApp +from qai_hub_models.models.facebook_denoiser.model import ( + ASSET_VERSION, + MODEL_ID, + SAMPLE_RATE, + FacebookDenoiser, +) +from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_path + +EXAMPLE_RECORDING = CachedWebModelAsset.from_asset_store( + MODEL_ID, ASSET_VERSION, "icsi_meeting_recording.wav" +) + + +def main(is_test: bool = False): + """ + Run facebook denoiser on a sample audio (`.wav`) file. + """ + parser = get_model_cli_parser(FacebookDenoiser) + parser.add_argument( + "--audio", + nargs="+", + default=[EXAMPLE_RECORDING], + help="WAV file paths or URLs", + ) + parser.add_argument( + "--sample_rate", + type=int, + default=SAMPLE_RATE, + help="Audio sample rate the model was trained on", + ) + parser.add_argument( + "--output_dir", + type=str, + default=os.getcwd(), + help="output directory (where output WAV should be written)", + ) + args = parser.parse_args([] if is_test else None) + + # Load Model + source_model = model_from_cli_args(FacebookDenoiser, args) + app = FacebookDenoiserApp(source_model, args.sample_rate) + + # Download data + audio: List[str] = args.audio + with tempfile.TemporaryDirectory() as tmpdir: + for idx, file in enumerate(audio): + audio[idx] = load_path(file, tmpdir) + + # Dump output from app + output = app.denoise_audio(audio, args.output_dir) + + if not is_test: + print("Wrote files:") + for path in output: + print(str(path)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/facebook_denoiser/export.py b/qai_hub_models/models/facebook_denoiser/export.py new file mode 100644 index 00000000..e5b6c0d4 --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/export.py @@ -0,0 +1,177 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.facebook_denoiser import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. 
+ skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/facebook_denoiser` inside the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "facebook_denoiser" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "facebook_denoiser", + "Facebook-Denoiser", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2.
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/facebook_denoiser/info.yaml b/qai_hub_models/models/facebook_denoiser/info.yaml new file mode 100644 index 00000000..28b16cc0 --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/info.yaml @@ -0,0 +1,30 @@ +name: Facebook-Denoiser +# id must match with the model dir name in qai_hub_models +id: facebook_denoiser +status: public +headline: Real-time speech denoising optimized for mobile and edge. +domain: Audio +description: Facebook Denoiser is a machine learning model that can denoise & isolate + voices in sound clips. 
class FacebookDenoiser(BaseModel):
    """BaseModel wrapper that delegates inference to a Facebook DNS network."""

    def __init__(self, net: torch.nn.Module) -> None:
        """
        Basic initializer which takes in a pretrained Facebook DNS network.

        Parameters:
            net: The network that `forward` delegates to.
        """
        super().__init__()
        self.net = net

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        """
        Predict denoised audio from noisy input audio.

        Parameters:
            audio: A [NUM_SOUND_CHANNELS, BATCH, SEQ_LEN] or [NUM_SOUND_CHANNELS, SEQ_LEN] audio snippet.
                   SEQ_LEN == AUDIO_SAMPLE_RATE * AUDIO_LENGTH_IN_SECONDS

        Returns:
            audio: A [NUM_SOUND_CHANNELS, BATCH, SEQ_LEN] denoised audio snippet.
        """
        # NOTE(review): get_input_spec describes the "audio" input as
        # (batch_size, 1, sequence_length); confirm that the axis order
        # documented above matches what the network actually expects.
        return self.net(audio)

    def get_input_spec(
        self,
        batch_size: int = 1,
        sequence_length: int = DEFAULT_SEQUENCE_LENGTH,
    ) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type)). This can be
        used to submit profiling job on Qualcomm AI Hub.

        Parameters:
            batch_size: First dimension of the "audio" input shape.
            sequence_length: Last dimension of the "audio" input shape.

        Returns:
            A single-entry mapping: "audio" -> ((batch_size, 1, sequence_length), "float32").
        """
        return {"audio": ((batch_size, 1, sequence_length), "float32")}

    @classmethod
    def from_pretrained(
        cls,
        state_dict_url: str | None = DNS_48_URL,
        hidden_layer_count: int = HIDDEN_LAYER_COUNT,
    ) -> FacebookDenoiser:
        """
        Build the model around a network created by `denoiser.pretrained._demucs`.

        Parameters:
            state_dict_url: Passed through to `_demucs`. When it is None, the
                first argument given to `_demucs` is False — presumably the
                "load pretrained weights" switch; confirm against the denoiser package.
            hidden_layer_count: Forwarded to `_demucs` as `hidden`.

        Returns:
            An instance of `cls` wrapping the created network.
        """
        net = pretrained._demucs(
            state_dict_url is not None, state_dict_url, hidden=hidden_layer_count
        )
        return cls(net)
def _handle_runtime_error(e: RuntimeError):
    """
    Print an installation hint when the error text reports a missing backend.

    Any RuntimeError whose message does not contain the expected backend
    phrase is re-raised unchanged.
    """
    backend_missing = "Couldn't find appropriate backend to handle uri" in str(e)
    if backend_missing:
        print(
            "You're missing either FFMPEG on Linux (apt-get install ffmpeg) or PySoundFile on Windows (pip install PySoundFile)"
        )
        return
    raise e
@pytest.mark.skip(reason="Fails with a mysterious error in DefaultCPUAllocator.")
def test_trace():
    """
    Check that the TorchScript-converted model reproduces the reference output.

    The example recording is loaded, its sample rate is checked against
    SAMPLE_RATE, and an input spec is built from the loaded tensor's two
    dimensions. The model is converted with that spec, run through
    FacebookDenoiserApp, and the result is compared with the stored enhanced
    recording.

    A RuntimeError raised while loading or running is handed to
    `_handle_runtime_error`; if that call returns, the test ends without
    performing the comparison.
    """
    try:
        input_data, sample_rate = torchaudio.load(EXAMPLE_RECORDING.fetch())
        assert sample_rate == SAMPLE_RATE
        batch_size, sequence_length = input_data.shape
        input_data = input_data.unsqueeze(1)

        model = FacebookDenoiser.from_pretrained()
        # get_input_spec is declared as (batch_size, sequence_length). Passing the
        # sizes by keyword keeps the spec at (batch_size, 1, sequence_length), which
        # matches input_data after unsqueeze(1). Passing them positionally in the
        # opposite order yields (sequence_length, 1, batch_size) instead.
        # NOTE(review): whether that transposed spec is behind the allocator
        # failure named in the skip reason is unconfirmed.
        input_spec = model.get_input_spec(
            batch_size=batch_size, sequence_length=sequence_length
        )
        app = FacebookDenoiserApp(model.convert_to_torchscript(input_spec))
        out = app.predict([input_data])[0][:, 0]
    except RuntimeError as e:
        _handle_runtime_error(e)
        return

    expected, _ = torchaudio.load(ENHANCED_EXAMPLE_RECORDING.fetch())
    torch.testing.assert_allclose(out, expected)
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/fastsam_s). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[fastsam_s]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.fastsam_s.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.fastsam_s.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of FastSam-S can be found + [here](https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE).
def main(is_test: bool = False):
    """
    Run the shared FastSAM demo with the FastSAM_S model class.

    Parameters:
        is_test: Forwarded unchanged to `fastsam_demo`.
    """
    # INPUT_IMAGE is the module-level asset handle used as the demo's image path.
    demo_options = {"image_path": INPUT_IMAGE, "is_test": is_test}
    fastsam_demo(FastSAM_S, **demo_options)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/fastsam_s`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
            The channel-last input/output flags are appended to this string.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as the options of the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When `can_access_qualcomm_ai_hub()` is false, the value returned by
        `export_without_hub_access` is returned instead (the `List[str]` arm
        of the return annotation).
    """
    model_name = "fastsam_s"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access none of the job submissions below can run, so hand
    # everything to the offline helper and return its result.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "fastsam_s",
            "FastSam-S",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(
        model, make_torch_inputs(input_spec), check_trace=False
    )

    # 2. Compile the model to an on-device asset
    # The channel-last flags name the same input ("image") and outputs that are
    # transposed around the inference job further down.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_1,output_2,output_3,output_5",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is only bound on this branch; step 6 reads it under the
        # same `not skip_inferencing` condition.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # The file name always carries the .tflite suffix, whatever dst_runtime is.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): the job-success checks below are `assert` statements, which
    # Python skips when run with -O; a failed job would then not stop here.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_1,output_2,output_3,output_5", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class FastSAM_S(Fast_SAM):
    """Exportable FastSAM model, end-to-end."""

    @classmethod
    def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS):
        """
        Delegate to `Fast_SAM.from_pretrained` with `ckpt_name`.

        Parameters:
            ckpt_name: Checkpoint name handed to the base class; defaults to
                DEFAULT_WEIGHTS.

        NOTE(review): the call goes to `Fast_SAM` directly and `cls` is unused,
        so whether the result is a `FastSAM_S` or a plain `Fast_SAM` depends on
        the base implementation — confirm.
        """
        return Fast_SAM.from_pretrained(ckpt_name)
def test_task():
    """Check that FastSAMApp's masks match those produced via ultralytics FastSAM."""
    source_path = INPUT_IMAGE.fetch()
    # NOTE(review): the preprocessed image is never read again in this test.
    image = Image.open(source_path)
    image = preprocess_PIL_image(image)

    # Masks produced through the app wrapper around the exportable model.
    app = FastSAMApp(FastSAM_S.from_pretrained())
    result, _ = app.segment_image(str(source_path))

    # Masks produced by calling the ultralytics model and prompt helper directly.
    reference_model = FastSAM(DEFAULT_WEIGHTS)
    reference_prompt = FastSAMPrompt(
        source_path,
        reference_model(
            source_path, device="cpu", retina_masks=True, imgsz=640, conf=0.4, iou=0.9
        ),
        device="cpu",
    )
    predictions = reference_prompt.everything_prompt()

    assert np.allclose(result[0].masks.data, predictions[0].masks.data)
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[fastsam_x]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.fastsam_x.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.fastsam_x.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of FastSam-X can be found + [here](https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE).
def main(is_test: bool = False):
    """
    Run the shared FastSAM demo with the FastSAM_X model class.

    Parameters:
        is_test: Forwarded unchanged to `fastsam_demo`.
    """
    # INPUT_IMAGE is the module-level asset handle used as the demo's image path.
    demo_options = {"image_path": INPUT_IMAGE, "is_test": is_test}
    fastsam_demo(FastSAM_X, **demo_options)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/fastsam_x`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
            The channel-last input/output flags are appended to this string.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as the options of the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When `can_access_qualcomm_ai_hub()` is false, the value returned by
        `export_without_hub_access` is returned instead (the `List[str]` arm
        of the return annotation).
    """
    model_name = "fastsam_x"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access none of the job submissions below can run, so hand
    # everything to the offline helper and return its result.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "fastsam_x",
            "FastSam-X",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(
        model, make_torch_inputs(input_spec), check_trace=False
    )

    # 2. Compile the model to an on-device asset
    # The channel-last flags name the same input ("image") and outputs that are
    # transposed around the inference job further down.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_1,output_2,output_3,output_5",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is only bound on this branch; step 6 reads it under the
        # same `not skip_inferencing` condition.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # The file name always carries the .tflite suffix, whatever dst_runtime is.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): the job-success checks below are `assert` statements, which
    # Python skips when run with -O; a failed job would then not stop here.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_1,output_2,output_3,output_5", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class FastSAM_X(Fast_SAM):
    """Exportable FastSAM model, end-to-end."""

    @classmethod
    def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS):
        """
        Delegate to `Fast_SAM.from_pretrained` with `ckpt_name`.

        Parameters:
            ckpt_name: Checkpoint name handed to the base class; defaults to
                DEFAULT_WEIGHTS.

        NOTE(review): the call goes to `Fast_SAM` directly and `cls` is unused,
        so whether the result is a `FastSAM_X` or a plain `Fast_SAM` depends on
        the base implementation — confirm.
        """
        return Fast_SAM.from_pretrained(ckpt_name)
Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: FastSam-X + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 64468.0 + throughput: 15.511571632437798 + estimated_peak_memory_range: + min: 9224192 + max: 14449200 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 420 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 420 + job_id: jz5wl3xzp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:38:35.191434Z' diff --git a/qai_hub_models/models/fastsam_x/requirements.txt b/qai_hub_models/models/fastsam_x/requirements.txt new file mode 100644 index 00000000..8d55bfa4 --- /dev/null +++ b/qai_hub_models/models/fastsam_x/requirements.txt @@ -0,0 +1,2 @@ +ultralytics==8.0.193 +torchvision diff --git a/qai_hub_models/models/fastsam_x/test.py b/qai_hub_models/models/fastsam_x/test.py new file mode 100644 index 00000000..83545e04 --- /dev/null +++ b/qai_hub_models/models/fastsam_x/test.py @@ -0,0 +1,30 @@ +import numpy as np +from PIL import Image +from ultralytics.models.fastsam import FastSAM, FastSAMPrompt + +from qai_hub_models.models._shared.fastsam.app import FastSAMApp +from qai_hub_models.models.fastsam_x.demo import INPUT_IMAGE +from qai_hub_models.models.fastsam_x.demo import main as demo_main +from qai_hub_models.models.fastsam_x.model import DEFAULT_WEIGHTS, FastSAM_X +from qai_hub_models.utils.image_processing import preprocess_PIL_image + + +def test_task(): + image_path = INPUT_IMAGE.fetch() + image = Image.open(image_path) + image = 
preprocess_PIL_image(image) + app = FastSAMApp(FastSAM_X.from_pretrained()) + result, _ = app.segment_image(str(image_path)) + + model = FastSAM(DEFAULT_WEIGHTS) + everything_results = model( + image_path, device="cpu", retina_masks=True, imgsz=640, conf=0.4, iou=0.9 + ) + prompt = FastSAMPrompt(image_path, everything_results, device="cpu") + predictions = prompt.everything_prompt() + + assert np.allclose(result[0].masks.data, predictions[0].masks.data) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/fcn_resnet50/README.md b/qai_hub_models/models/fcn_resnet50/README.md new file mode 100644 index 00000000..3763ff51 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FCN_ResNet50: Fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50) + +FCN_ResNet50 is a machine learning model that can segment images from the COCO dataset. It uses ResNet50 as a backbone. + +This is based on the implementation of FCN_ResNet50 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/fcn_resnet50). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.fcn_resnet50.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts.
Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.fcn_resnet50.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of FCN_ResNet50 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py) diff --git a/qai_hub_models/models/fcn_resnet50/__init__.py b/qai_hub_models/models/fcn_resnet50/__init__.py new file mode 100644 index 00000000..96cbd29e --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50/__init__.py @@ -0,0 +1,3 @@ +from .app import FCN_ResNet50App as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import FCN_ResNet50 as Model # noqa: F401 diff --git a/qai_hub_models/models/fcn_resnet50/app.py b/qai_hub_models/models/fcn_resnet50/app.py new file mode 100644 index 00000000..24d829e8 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50/app.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from collections import OrderedDict +from typing import Callable + +import numpy as np +import PIL +import torch +from PIL.Image import Image +from torchvision import transforms + +from qai_hub_models.models.fcn_resnet50.model import NUM_CLASSES +from qai_hub_models.utils.draw import create_color_map +from qai_hub_models.utils.image_processing import 
class FCN_ResNet50App:
    """
    This class consists of light-weight "app code" that is required to
    perform end to end inference with FCN_ResNet50.

    For a given image input, the app will:
        * Pre-process the image (convert to a tensor and normalize)
        * Run image segmentation
        * Reduce the raw output to one class index per pixel (argmax over
          the class dimension)
        * Optionally blend a color-coded class map over the input image
    """

    def __init__(self, model: Callable[[torch.Tensor], torch.Tensor]):
        """
        Parameters:
            model: Callable that maps a pre-processed image batch to the
                segmentation output. `predict` indexes the result with `[0]`
                and then calls `.argmax(0)` on it, so it must be a tensor
                with a leading batch dimension, not a dict of outputs.
        """
        self.model = model

    def predict(self, image: Image, raw_output: bool = False) -> Image | np.ndarray:
        """
        Segment the provided image.

        Parameters:
            image: A PIL Image. Images in a mode other than RGB are
                converted to RGB before inference.
            raw_output: If true, return the per-pixel class indices instead
                of the blended visualization.

        Returns:
            If raw_output is true, returns:
            predictions: np.ndarray
                2D uint8 array holding the predicted class index of every pixel.

            Otherwise, returns:
            segmented_image: PIL.Image
                Input image with the segmentation map overlaid with an alpha of 0.5.
        """
        # PIL.Image.blend needs both images to share size and mode, and the
        # overlay built below is created from the color map, so bring other
        # PIL modes (e.g. RGBA, L) to RGB up front. RGB images and non-PIL
        # inputs pass through untouched.
        if isinstance(image, Image) and image.mode != "RGB":
            image = image.convert("RGB")

        input_tensor = preprocess_image(image)
        with torch.no_grad():
            output = self.model(input_tensor)
        # preprocess_image adds a batch dimension of one; drop it again.
        output = output[0]
        # argmax over the first remaining dimension picks one class per pixel.
        predictions = output.argmax(0).byte().cpu().numpy()

        if raw_output:
            return predictions

        # presumably one RGB color per class id -- TODO confirm in utils.draw
        color_map = create_color_map(NUM_CLASSES)
        out = PIL.Image.blend(image, PIL.Image.fromarray(color_map[predictions]), 0.5)

        return out
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export FCN_ResNet50 for on-device use through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/fcn_resnet50`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as the options of the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub cannot be accessed, nothing is submitted and the
        result of `export_without_hub_access` is returned instead.

    Raises:
        AssertionError: If the profile or inference job does not report
            success while its results are being summarized.
    """
    model_name = "fcn_resnet50"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access no job can be submitted; hand off to the fallback
    # helper and return whatever it produces.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "fcn_resnet50",
            "FCN_ResNet50",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flags are appended to the caller's options; the names
    # "image" and "output_0" must match the ones transposed in steps 4 and 6.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is only bound in this branch; step 6 reads it under
        # the same `not skip_inferencing` guard.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", whatever
        # dst_runtime was requested -- confirm this is intended for non-TFLite targets.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): under `python -O` assert statements are stripped, so the
    # `wait()` calls inside the asserts below would not run at all.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Reference outputs computed locally with the PyTorch model.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class FCN_ResNet50(BaseModel):
    """FCN image segmentation network with a ResNet50 backbone, wrapped for export."""

    def __init__(self, fcn_model: torch.nn.Module) -> None:
        super().__init__()
        self.model = fcn_model

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> FCN_ResNet50:
        """
        Build the torchvision FCN-ResNet50 with the requested weights and
        wrap it, switched to eval mode.

        Parameters:
            weights: Name of the torchvision weight set to load.
        """
        build_fcn = tv_models.segmentation.fcn_resnet50
        return cls(build_fcn(weights=weights).eval())

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """
        Run FCN_ResNet50 on `image` and return the segmentation output.

        Parameters:
            image: Pixel values pre-processed for model consumption.
                   3-channel Color Space: RGB

        Returns:
            The "out" entry of the wrapped model's output: a tensor of
            per-pixel class logits (21 classes, see NUM_CLASSES).
        """
        outputs = self.model(image)
        return outputs["out"]

    def get_input_spec(
        self,
        batch_size: int = 1,
        num_channels: int = 3,
        height: int = 224,
        width: int = 224,
    ) -> InputSpec:
        """
        Describe the model input as ordered (name -> (shape, type)) pairs.

        This can be used with the qai_hub python API to declare the model
        input specification upon submitting a profile job.
        """
        image_shape = (batch_size, num_channels, height, width)
        return {"image": (image_shape, "float32")}
def _test_impl(app: FCN_ResNet50App):
    """
    Run `app` on the demo input image and compare the blended result with
    the reference output image from the asset store.

    Both images are scaled to [0, 1] before the comparison, which allows a
    relative tolerance of 0.02 and an absolute tolerance of 0.2.
    """
    source_image = load_image(INPUT_IMAGE_ADDRESS)
    reference_image = load_image(OUTPUT_IMAGE_ADDRESS)
    blended_image = app.predict(source_image, False)

    actual_scaled = np.asarray(blended_image, dtype=np.float32) / 255
    expected_scaled = np.asarray(reference_image, dtype=np.float32) / 255
    np.testing.assert_allclose(actual_scaled, expected_scaled, rtol=0.02, atol=0.2)
/dev/null +++ b/qai_hub_models/models/ffnet_122ns_lowres/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FFNet-122NS-LowRes: Semantic segmentation for automotive street scenes](https://aihub.qualcomm.com/models/ffnet_122ns_lowres) + +FFNet-122NS-LowRes is a "fuss-free network" that segments street scene images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the Cityscapes dataset. + +This is based on the implementation of FFNet-122NS-LowRes found +[here](https://github.com/Qualcomm-AI-research/FFNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/ffnet_122ns_lowres). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[ffnet_122ns_lowres]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.ffnet_122ns_lowres.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.ffnet_122ns_lowres.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export FFNet-122NS-LowRes for on-device use through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/ffnet_122ns_lowres`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as the options of the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub cannot be accessed, nothing is submitted and the
        result of `export_without_hub_access` is returned instead.

    Raises:
        AssertionError: If the profile or inference job does not report
            success while its results are being summarized.
    """
    model_name = "ffnet_122ns_lowres"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access no job can be submitted; hand off to the fallback
    # helper and return whatever it produces.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "ffnet_122ns_lowres",
            "FFNet-122NS-LowRes",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flags are appended to the caller's options; the names
    # "image" and "output_0" must match the ones transposed in steps 4 and 6.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is only bound in this branch; step 6 reads it under
        # the same `not skip_inferencing` guard.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", whatever
        # dst_runtime was requested -- confirm this is intended for non-TFLite targets.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): under `python -O` assert statements are stripped, so the
    # `wait()` calls inside the asserts below would not run at all.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Reference outputs computed locally with the PyTorch model.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)
+use_case: Semantic Segmentation +tags: [] +research_paper: https://arxiv.org/abs/2206.08236 +research_paper_title: "Simple and Efficient Architectures for Semantic Segmentation" +license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +source_repo: https://github.com/Qualcomm-AI-research/FFNet +technical_details: + Number of parameters: 32.2M + Model size: TODO + Model checkpoint: ffnet122NS_CCC_cityscapes_state_dict_quarts_pre_down + Input resolution: 1024x512 +applicable_scenarios: + - Automotive + - Autonomous Driving + - Camera +related_models: + - ffnet_78s_lowres + - ffnet_54s + - unet_segmentation +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - cityscapes diff --git a/qai_hub_models/models/ffnet_122ns_lowres/model.py b/qai_hub_models/models/ffnet_122ns_lowres/model.py new file mode 100644 index 00000000..021b404d --- /dev/null +++ b/qai_hub_models/models/ffnet_122ns_lowres/model.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.ffnet.model import FFNetLowRes + +MODEL_ID = __name__.split(".")[-2] + + +class FFNet122NSLowRes(FFNetLowRes): + @classmethod + def from_pretrained(cls) -> FFNet122NSLowRes: + return FFNetLowRes.from_pretrained.__func__( + cls, "segmentation_ffnet122NS_CCC_mobile_pre_down" + ) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml new file mode 100644 index 00000000..f41f23a1 --- /dev/null +++ b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - 
Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: FFNet-122NS-LowRes + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 10460.0 + throughput: 95.60229445506693 + estimated_peak_memory_range: + min: 643072 + max: 2912400 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 216 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 216 + job_id: jqpyojnr5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 10778.0 + throughput: 92.78159213212099 + estimated_peak_memory_range: + min: 6332416 + max: 39442976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 349 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 349 + job_id: j2p0m2k2g + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:22:01.714758Z' diff --git a/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt b/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt new file mode 100644 index 00000000..73ad8aa8 --- /dev/null +++ b/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt @@ -0,0 +1 @@ +scikit-image>=0.21.0 diff --git a/qai_hub_models/models/ffnet_122ns_lowres/test.py b/qai_hub_models/models/ffnet_122ns_lowres/test.py new file mode 100644 index 00000000..0fefd2ed --- /dev/null +++ b/qai_hub_models/models/ffnet_122ns_lowres/test.py @@ -0,0 +1,16 @@ +from qai_hub_models.models._shared.ffnet.test_utils import run_test_off_target_numerical +from qai_hub_models.models.ffnet_122ns_lowres.demo import main as demo_main +from qai_hub_models.models.ffnet_122ns_lowres.model import FFNet122NSLowRes +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def 
test_off_target_numerical(): + run_test_off_target_numerical( + FFNet122NSLowRes, "segmentation_ffnet122NS_CCC_mobile_pre_down" + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/ffnet_40s/README.md b/qai_hub_models/models/ffnet_40s/README.md new file mode 100644 index 00000000..33ac664e --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FFNet-40S: Semantic segmentation for automotive street scenes](https://aihub.qualcomm.com/models/ffnet_40s) + +FFNet-40S is a "fuss-free network" that segments street scene images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the Cityscapes dataset. + +This is based on the implementation of FFNet-40S found +[here](https://github.com/Qualcomm-AI-research/FFNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/ffnet_40s). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[ffnet_40s]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.ffnet_40s.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. 
This can be run as follows: + +```bash +python -m qai_hub_models.models.ffnet_40s.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of FFNet-40S can be found + [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). + + +## References +* [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) +* [Source Model Implementation](https://github.com/Qualcomm-AI-research/FFNet) diff --git a/qai_hub_models/models/ffnet_40s/__init__.py b/qai_hub_models/models/ffnet_40s/__init__.py new file mode 100644 index 00000000..41184f5f --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.cityscapes_segmentation.app import ( # noqa: F401 + CityscapesSegmentationApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import FFNet40S as Model # noqa: F401 diff --git a/qai_hub_models/models/ffnet_40s/demo.py b/qai_hub_models/models/ffnet_40s/demo.py new file mode 100644 index 00000000..29461f74 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/demo.py @@ -0,0 +1,12 @@ +from qai_hub_models.models._shared.cityscapes_segmentation.demo import ( + cityscapes_segmentation_demo, +) +from qai_hub_models.models.ffnet_40s.model import MODEL_ID, FFNet40S + + +def main(is_test: bool = False): + cityscapes_segmentation_demo(FFNet40S, MODEL_ID, is_test=is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ffnet_40s/export.py b/qai_hub_models/models/ffnet_40s/export.py new file mode 100644 index 00000000..633484bc --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/export.py @@ -0,0 +1,190 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.ffnet_40s import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to "Samsung Galaxy S23" if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/ffnet_40s` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of (or, if AI Hub is not accessible, the result of `export_without_hub_access`): + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "ffnet_40s" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "ffnet_40s", + "FFNet-40S", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file (NOTE(review): always saved as `.tflite`, whatever dst_runtime is; confirm) + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ffnet_40s/info.yaml b/qai_hub_models/models/ffnet_40s/info.yaml new file mode 100644 index 00000000..d6608728 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/info.yaml @@ -0,0 +1,35 @@ +name: FFNet-40S +# id must match with the model dir name in qai_hub_models +id: ffnet_40s +status: public +headline: Semantic segmentation for automotive street scenes. +domain: Computer Vision +description: FFNet-40S is a "fuss-free network" that segments street scene images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the Cityscapes dataset. 
+use_case: Semantic Segmentation +tags: + - real-time +research_paper: https://arxiv.org/abs/2206.08236 +research_paper_title: "Simple and Efficient Architectures for Semantic Segmentation" +license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +source_repo: https://github.com/Qualcomm-AI-research/FFNet +technical_details: + Number of parameters: 13.9M + Model size: 56 MB + Model checkpoint: ffnet40S_dBBB_cityscapes_state_dict_quarts + Input resolution: 2048x1024 +applicable_scenarios: + - Automotive + - Autonomous Driving + - Camera +related_models: + - ffnet_54s + - ffnet_78s + - deeplabv3_plus_mobilenet +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - cityscapes diff --git a/qai_hub_models/models/ffnet_40s/model.py b/qai_hub_models/models/ffnet_40s/model.py new file mode 100644 index 00000000..e404dc17 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/model.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.ffnet.model import FFNet + +MODEL_ID = __name__.split(".")[-2] + + +class FFNet40S(FFNet): + @classmethod + def from_pretrained(cls) -> FFNet40S: + return FFNet.from_pretrained.__func__(cls, "segmentation_ffnet40S_dBBB_mobile") diff --git a/qai_hub_models/models/ffnet_40s/perf.yaml b/qai_hub_models/models/ffnet_40s/perf.yaml new file mode 100644 index 00000000..cd79f677 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 
Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: FFNet-40S + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 22739.0 + throughput: 43.97730770922204 + estimated_peak_memory_range: + min: 2564096 + max: 5001048 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 92 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 92 + job_id: jegnzm9vg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 17313.0 + throughput: 57.760064691272454 + estimated_peak_memory_range: + min: 25202688 + max: 51306904 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 141 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 141 + job_id: jep2r97xg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:14:26.648274Z' diff --git a/qai_hub_models/models/ffnet_40s/requirements.txt b/qai_hub_models/models/ffnet_40s/requirements.txt new file mode 100644 index 00000000..73ad8aa8 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/requirements.txt @@ -0,0 +1 @@ +scikit-image>=0.21.0 diff --git a/qai_hub_models/models/ffnet_40s/test.py b/qai_hub_models/models/ffnet_40s/test.py new file mode 100644 index 00000000..fa78314c --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/test.py @@ -0,0 +1,14 @@ +from qai_hub_models.models._shared.ffnet.test_utils import run_test_off_target_numerical +from qai_hub_models.models.ffnet_40s.demo import main as demo_main +from qai_hub_models.models.ffnet_40s.model import FFNet40S +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_off_target_numerical(): + run_test_off_target_numerical(FFNet40S, "segmentation_ffnet40S_dBBB_mobile") + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git 
a/qai_hub_models/models/ffnet_40s_quantized/README.md b/qai_hub_models/models/ffnet_40s_quantized/README.md new file mode 100644 index 00000000..abb8c6d4 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FFNet-40S-Quantized: Semantic segmentation for automotive street scenes](https://aihub.qualcomm.com/models/ffnet_40s_quantized) + +FFNet-40S-Quantized is a "fuss-free network" that segments street scene images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the Cityscapes dataset. + +This is based on the implementation of FFNet-40S-Quantized found +[here](https://github.com/Qualcomm-AI-research/FFNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/ffnet_40s_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.ffnet_40s_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.ffnet_40s_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. 
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of FFNet-40S-Quantized can be found + [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). + + +## References +* [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) +* [Source Model Implementation](https://github.com/Qualcomm-AI-research/FFNet) diff --git a/qai_hub_models/models/ffnet_40s_quantized/__init__.py b/qai_hub_models/models/ffnet_40s_quantized/__init__.py new file mode 100644 index 00000000..fb0bfdb4 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.cityscapes_segmentation.app import ( # noqa: F401 + CityscapesSegmentationApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import FFNet40SQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/ffnet_40s_quantized/demo.py b/qai_hub_models/models/ffnet_40s_quantized/demo.py new file mode 100644 index 00000000..63383b7c --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/demo.py @@ -0,0 +1,15 @@ +from qai_hub_models.models._shared.cityscapes_segmentation.demo import ( + cityscapes_segmentation_demo, +) +from qai_hub_models.models.ffnet_40s_quantized.model import ( + MODEL_ID, + FFNet40SQuantizable, +) + + +def main(is_test: bool = False): + cityscapes_segmentation_demo(FFNet40SQuantizable, MODEL_ID, is_test=is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ffnet_40s_quantized/export.py b/qai_hub_models/models/ffnet_40s_quantized/export.py new file mode 100644 index 00000000..573e4ea2 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/export.py @@ -0,0 +1,200 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.ffnet_40s_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to "Samsung Galaxy S23" if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/ffnet_40s_quantized` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of (or, if AI Hub is not accessible, the result of `export_without_hub_access`): + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "ffnet_40s_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "ffnet_40s_quantized", + "FFNet-40S-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last (NOTE(review): reads sample_inputs, so the QNN hub_inputs assigned above is overwritten; confirm intended) + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file (NOTE(review): always saved as `.tflite`, whatever dst_runtime is; confirm) + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ffnet_40s_quantized/info.yaml b/qai_hub_models/models/ffnet_40s_quantized/info.yaml new file mode 100644 index 00000000..5d8e7f91 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/info.yaml @@ -0,0 +1,38 @@ +name: FFNet-40S-Quantized +# id must match with the model dir name in qai_hub_models +id: ffnet_40s_quantized +status: public +headline: Semantic segmentation for automotive street scenes. +domain: Computer Vision +description: FFNet-40S-Quantized is a "fuss-free network" that segments street scene + images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the + Cityscapes dataset. 
+use_case: Semantic Segmentation +tags: +- quantized +- real-time +research_paper: https://arxiv.org/abs/2206.08236 +research_paper_title: Simple and Efficient Architectures for Semantic Segmentation +license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +source_repo: https://github.com/Qualcomm-AI-research/FFNet +technical_details: + Number of parameters: 13.9M + Model size: 13.5 MB + Model checkpoint: ffnet40S_dBBB_cityscapes_state_dict_quarts + Input resolution: 2048x1024 +applicable_scenarios: +- Automotive +- Autonomous Driving +- Camera +related_models: +- ffnet_40s +- ffnet_54s_quantized +- ffnet_78s_quantized +form_factors: +- Phone +- Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: +- cityscapes diff --git a/qai_hub_models/models/ffnet_40s_quantized/model.py b/qai_hub_models/models/ffnet_40s_quantized/model.py new file mode 100644 index 00000000..d4b395fd --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/model.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.ffnet_quantized.model import FFNetQuantizable +from qai_hub_models.models.ffnet_40s.model import FFNet40S +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "encodings.json" + + +class FFNet40SQuantizable(FFNetQuantizable, FFNet40S): + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> FFNet40SQuantizable: + return FFNetQuantizable.from_pretrained.__func__( + cls, + "segmentation_ffnet40S_dBBB_mobile", + aimet_encodings=aimet_encodings, + ) + + @classmethod + def default_aimet_encodings(cls) -> str: + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() diff --git a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml new file mode 
100644 index 00000000..d0408e2b --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: FFNet-40S-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 6451.0 + throughput: 155.0147263990079 + estimated_peak_memory_range: + min: 851968 + max: 2582296 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 97 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 97 + job_id: j0pxl6x9p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:15:22.015621Z' diff --git a/qai_hub_models/models/ffnet_40s_quantized/test.py b/qai_hub_models/models/ffnet_40s_quantized/test.py new file mode 100644 index 00000000..ff722757 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/test.py @@ -0,0 +1,17 @@ +from qai_hub_models.models._shared.ffnet.test_utils import run_test_off_target_numerical +from qai_hub_models.models.ffnet_40s_quantized.demo import main as demo_main 
@skip_clone_repo_check
def test_off_target_numerical():
    """Run the shared FFNet off-target numerical check for the quantized 40S model."""
    run_test_off_target_numerical(
        FFNet40SQuantizable,
        "segmentation_ffnet40S_dBBB_mobile",
        # Passed through to the shared helper; the non-quantized FFNet tests
        # in this package do not set it.
        relax_numerics=True,
    )


# Decorated for consistency with the non-quantized FFNet test modules, whose
# demo test is also wrapped in skip_clone_repo_check.
# NOTE(review): assumes the demo loads the same repo-backed model as the
# numerical test above -- confirm.
@skip_clone_repo_check
def test_demo():
    """Smoke-test the CLI demo entry point in test mode."""
    demo_main(is_test=True)
def main(is_test: bool = False) -> None:
    """
    Run the shared Cityscapes segmentation demo with the FFNet-54S model.

    Parameters:
        is_test: Passed through to ``cityscapes_segmentation_demo``.
    """
    cityscapes_segmentation_demo(FFNet54S, MODEL_ID, is_test=is_test)


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/<model_name>`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead (annotated as a list of
        strings).

    Raises:
        AssertionError: If a profile or inference job that is being summarized
            did not succeed.
    """
    model_name = "ffnet_54s"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "FFNet-54S",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file is always named *.tflite, whatever dst_runtime is.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # Wait outside the assert so the job is still awaited under `python -O`,
        # which strips assert statements together with their expressions.
        profile_status = profile_job.wait()
        assert profile_status.success, f"Profile job for {model_name} failed."
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        inference_status = inference_job.wait()
        assert inference_status.success, f"Inference job for {model_name} failed."
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export CLI arguments for this model and run `export_model`."""
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
class FFNet54S(FFNet):
    """FFNet-54S model class built on the shared ``FFNet`` base."""

    @classmethod
    def from_pretrained(cls) -> FFNet54S:
        """Build the model by delegating to ``FFNet.from_pretrained`` with this model's name."""
        # Take the underlying function of the classmethod so that `cls` is this
        # subclass instead of FFNet.
        shared_loader = FFNet.from_pretrained.__func__
        ffnet_name = "segmentation_ffnet54S_dBBB_mobile"
        return shared_loader(cls, ffnet_name)
@skip_clone_repo_check
def test_off_target_numerical():
    """Run the shared FFNet off-target numerical check for FFNet-54S."""
    ffnet_name = "segmentation_ffnet54S_dBBB_mobile"
    run_test_off_target_numerical(FFNet54S, ffnet_name)


@skip_clone_repo_check
def test_demo():
    """Smoke-test the CLI demo entry point in test mode."""
    demo_main(is_test=True)
a/qai_hub_models/models/ffnet_54s_quantized/README.md b/qai_hub_models/models/ffnet_54s_quantized/README.md new file mode 100644 index 00000000..d978a2e7 --- /dev/null +++ b/qai_hub_models/models/ffnet_54s_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FFNet-54S-Quantized: Semantic segmentation for automotive street scenes](https://aihub.qualcomm.com/models/ffnet_54s_quantized) + +FFNet-54S-Quantized is a "fuss-free network" that segments street scene images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the Cityscapes dataset. + +This is based on the implementation of FFNet-54S-Quantized found +[here](https://github.com/Qualcomm-AI-research/FFNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/ffnet_54s_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.ffnet_54s_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.ffnet_54s_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
def main(is_test: bool = False) -> None:
    """
    Run the shared Cityscapes segmentation demo with the quantizable FFNet-54S model.

    Parameters:
        is_test: Passed through to ``cityscapes_segmentation_demo``.
    """
    cityscapes_segmentation_demo(FFNet54SQuantizable, MODEL_ID, is_test=is_test)


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a hub source model.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/<model_name>`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead (annotated as a list of
        strings).

    Raises:
        AssertionError: If a profile or inference job that is being summarized
            did not succeed.
    """
    model_name = "ffnet_54s_quantized"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "FFNet-54S-Quantized",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Convert the model to the source format submitted to hub
    source_model = model.convert_to_hub_source_model(
        target_runtime, output_path, input_spec
    )
    # Calibration data is only requested for non-TFLite targets.
    if target_runtime == TargetRuntime.TFLITE:
        quant_calibration_data = None
    else:
        quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        calibration_data=quant_calibration_data,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last while they are
        # still keyed by the source model's input name ("image").
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        if target_runtime == TargetRuntime.QNN:
            # Apply the QNN conversion to the transposed inputs so that neither
            # step's result is discarded.
            # NOTE(review): assumes get_qnn_inputs accepts channel-last
            # inputs -- confirm against qnn_helpers.
            hub_inputs = get_qnn_inputs(compile_job, hub_inputs)
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file is always named *.tflite, whatever dst_runtime is.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # Wait outside the assert so the job is still awaited under `python -O`,
        # which strips assert statements together with their expressions.
        profile_status = profile_job.wait()
        assert profile_status.success, f"Profile job for {model_name} failed."
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        inference_status = inference_job.wait()
        assert inference_status.success, f"Inference job for {model_name} failed."
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export CLI arguments (QNN disabled in the parser) and run `export_model`."""
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, supports_qnn=False)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
class FFNet54SQuantizable(FFNetQuantizable, FFNet54S):
    """Quantizable FFNet-54S: mixes ``FFNetQuantizable`` into ``FFNet54S``."""

    @classmethod
    def from_pretrained(
        cls,
        aimet_encodings: str | None = "DEFAULT",
    ) -> FFNet54SQuantizable:
        """
        Build the model by delegating to ``FFNetQuantizable.from_pretrained``.

        Parameters:
            aimet_encodings: Forwarded unchanged to the shared loader.
                NOTE(review): presumably "DEFAULT" resolves to
                ``default_aimet_encodings()`` -- confirm in the shared
                ``ffnet_quantized`` model module.

        Returns:
            The object produced by the shared loader for this class.
        """
        # Take the underlying function of the classmethod so that `cls` is this
        # subclass instead of FFNetQuantizable.
        shared_loader = FFNetQuantizable.from_pretrained.__func__
        ffnet_name = "segmentation_ffnet54S_dBBB_mobile"
        return shared_loader(cls, ffnet_name, aimet_encodings=aimet_encodings)

    @classmethod
    def default_aimet_encodings(cls) -> str:
        """Fetch this model's default encodings asset and return the fetch result."""
        encodings_asset = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS
        )
        return encodings_asset.fetch()
index 00000000..4e723a91 --- /dev/null +++ b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: FFNet-54S-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 7130.0 + throughput: 140.25245441795232 + estimated_peak_memory_range: + min: 643072 + max: 23970880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 118 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 118 + job_id: jep2r9wmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:19:49.268425Z' diff --git a/qai_hub_models/models/ffnet_54s_quantized/test.py b/qai_hub_models/models/ffnet_54s_quantized/test.py new file mode 100644 index 00000000..9050ee74 --- /dev/null +++ b/qai_hub_models/models/ffnet_54s_quantized/test.py @@ -0,0 +1,17 @@ +from qai_hub_models.models._shared.ffnet.test_utils import run_test_off_target_numerical +from qai_hub_models.models.ffnet_54s_quantized.demo import main as demo_main +from 
@skip_clone_repo_check
def test_off_target_numerical():
    """Run the shared FFNet off-target numerical check for the quantized 54S model."""
    run_test_off_target_numerical(
        FFNet54SQuantizable,
        "segmentation_ffnet54S_dBBB_mobile",
        # Passed through to the shared helper; the non-quantized FFNet tests
        # in this package do not set it.
        relax_numerics=True,
    )


# Decorated for consistency with the non-quantized FFNet test modules, whose
# demo test is also wrapped in skip_clone_repo_check.
# NOTE(review): assumes the demo loads the same repo-backed model as the
# numerical test above -- confirm.
@skip_clone_repo_check
def test_demo():
    """Smoke-test the CLI demo entry point in test mode."""
    demo_main(is_test=True)
def main(is_test: bool = False):
    """Run the shared Cityscapes segmentation demo with the FFNet-78S model.

    Parameters:
        is_test: Passed through unchanged to `cityscapes_segmentation_demo`.
    """
    cityscapes_segmentation_demo(
        FFNet78S,
        MODEL_ID,
        is_test=is_test,
    )


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/ffnet_78s`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is accessible, a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        Otherwise, whatever `export_without_hub_access` returns
        (annotated here as a list of strings).

    Raises:
        AssertionError: If a summary is requested and the profile or inference
            job does not report success.
    """
    model_name = "ffnet_78s"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "ffnet_78s",
            "FFNet-78S",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", whatever
        # `dst_runtime` was requested -- confirm this is intended for non-TFLite targets.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # `profile_job` / `inference_job` are non-None exactly when the matching
    # step was not skipped, so testing them is equivalent to testing the flags.
    if not skip_summary and profile_job is not None:
        # Explicit check rather than `assert`: under `python -O` an assert is
        # removed together with the wait() call inside it. AssertionError is
        # kept so the failure type seen by existing callers does not change.
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and inference_job is not None:
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export CLI arguments and run `export_model` with them."""
    # All warnings are silenced for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
class FFNet78S(FFNet):
    """FFNet variant that loads the "segmentation_ffnet78S_dBBB_mobile" weights."""

    @classmethod
    def from_pretrained(cls) -> FFNet78S:
        """Build this class through the parent's `from_pretrained` implementation.

        The parent's classmethod is unwrapped with `__func__` and called with
        `cls` explicitly, so it receives this subclass rather than `FFNet`.
        """
        variant_name = "segmentation_ffnet78S_dBBB_mobile"
        parent_loader = FFNet.from_pretrained.__func__
        return parent_loader(cls, variant_name)
@skip_clone_repo_check
def test_off_target_numerical():
    """Run the shared FFNet off-target numerical check on FFNet-78S."""
    variant_name = "segmentation_ffnet78S_dBBB_mobile"
    run_test_off_target_numerical(FFNet78S, variant_name)


@skip_clone_repo_check
def test_demo():
    """Invoke the model's demo entry point with `is_test=True`."""
    demo_main(is_test=True)
a/qai_hub_models/models/ffnet_78s_lowres/README.md b/qai_hub_models/models/ffnet_78s_lowres/README.md new file mode 100644 index 00000000..5b8d5e2d --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_lowres/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FFNet-78S-LowRes: Semantic segmentation for automotive street scenes](https://aihub.qualcomm.com/models/ffnet_78s_lowres) + +FFNet-78S-LowRes is a "fuss-free network" that segments street scene images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the Cityscapes dataset. + +This is based on the implementation of FFNet-78S-LowRes found +[here](https://github.com/Qualcomm-AI-research/FFNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/ffnet_78s_lowres). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[ffnet_78s_lowres]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.ffnet_78s_lowres.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.ffnet_78s_lowres.export +``` +Additional options are documented with the `--help` option.
def main(is_test: bool = False):
    """Run the shared Cityscapes segmentation demo with the FFNet-78S-LowRes model.

    Parameters:
        is_test: Passed through unchanged to `cityscapes_segmentation_demo`.
    """
    cityscapes_segmentation_demo(
        FFNet78SLowRes,
        MODEL_ID,
        is_test=is_test,
    )


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/ffnet_78s_lowres`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is accessible, a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        Otherwise, whatever `export_without_hub_access` returns
        (annotated here as a list of strings).

    Raises:
        AssertionError: If a summary is requested and the profile or inference
            job does not report success.
    """
    model_name = "ffnet_78s_lowres"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "ffnet_78s_lowres",
            "FFNet-78S-LowRes",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", whatever
        # `dst_runtime` was requested -- confirm this is intended for non-TFLite targets.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # `profile_job` / `inference_job` are non-None exactly when the matching
    # step was not skipped, so testing them is equivalent to testing the flags.
    if not skip_summary and profile_job is not None:
        # Explicit check rather than `assert`: under `python -O` an assert is
        # removed together with the wait() call inside it. AssertionError is
        # kept so the failure type seen by existing callers does not change.
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and inference_job is not None:
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export CLI arguments and run `export_model` with them."""
    # All warnings are silenced for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
class FFNet78SLowRes(FFNetLowRes):
    """FFNetLowRes variant that loads "segmentation_ffnet78S_BCC_mobile_pre_down"."""

    @classmethod
    def from_pretrained(cls) -> FFNet78SLowRes:
        """Build this class through the parent's `from_pretrained` implementation.

        The parent's classmethod is unwrapped with `__func__` and called with
        `cls` explicitly, so it receives this subclass rather than `FFNetLowRes`.
        """
        variant_name = "segmentation_ffnet78S_BCC_mobile_pre_down"
        parent_loader = FFNetLowRes.from_pretrained.__func__
        return parent_loader(cls, variant_name)
@skip_clone_repo_check
def test_off_target_numerical():
    """Run the shared FFNet off-target numerical check on FFNet-78S-LowRes."""
    variant_name = "segmentation_ffnet78S_BCC_mobile_pre_down"
    run_test_off_target_numerical(FFNet78SLowRes, variant_name)


@skip_clone_repo_check
def test_demo():
    """Invoke the model's demo entry point with `is_test=True`."""
    demo_main(is_test=True)
def main(is_test: bool = False):
    """Run the shared Cityscapes segmentation demo with the quantizable FFNet-78S model.

    Parameters:
        is_test: Passed through unchanged to `cityscapes_segmentation_demo`.
    """
    cityscapes_segmentation_demo(
        FFNet78SQuantizable,
        MODEL_ID,
        is_test=is_test,
    )


if __name__ == "__main__":
    main()
00000000..661fee65 --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_quantized/export.py @@ -0,0 +1,200 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.ffnet_78s_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. 
+ Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "ffnet_78s_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "ffnet_78s_quantized", + "FFNet-78S-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." 
+ ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/ffnet_78s_quantized/info.yaml b/qai_hub_models/models/ffnet_78s_quantized/info.yaml new file mode 100644 index 00000000..8864452c --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_quantized/info.yaml @@ -0,0 +1,38 @@ +name: FFNet-78S-Quantized +# id must match with the 
model dir name in qai_hub_models +id: ffnet_78s_quantized +status: public +headline: Semantic segmentation for automotive street scenes. +domain: Computer Vision +description: FFNet-78S-Quantized is a "fuss-free network" that segments street scene + images with per-pixel classes like road, sidewalk, and pedestrian. Trained on the + Cityscapes dataset. +use_case: Semantic Segmentation +tags: +- quantized +- real-time +research_paper: https://arxiv.org/abs/2206.08236 +research_paper_title: Simple and Efficient Architectures for Semantic Segmentation +license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +source_repo: https://github.com/Qualcomm-AI-research/FFNet +technical_details: + Number of parameters: 27.5M + Model size: 26.7 MB + Model checkpoint: ffnet78S_dBBB_cityscapes_state_dict_quarts + Input resolution: 2048x1024 +applicable_scenarios: +- Automotive +- Autonomous Driving +- Camera +related_models: +- ffnet_78s +- ffnet_40s_quantized +- ffnet_54s_quantized +form_factors: +- Phone +- Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: +- cityscapes diff --git a/qai_hub_models/models/ffnet_78s_quantized/model.py b/qai_hub_models/models/ffnet_78s_quantized/model.py new file mode 100644 index 00000000..8c4f8853 --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_quantized/model.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from qai_hub_models.models._shared.ffnet_quantized.model import FFNetQuantizable +from qai_hub_models.models.ffnet_78s.model import FFNet78S +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "encodings.json" + + +class FFNet78SQuantizable(FFNetQuantizable, FFNet78S): + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> FFNet78SQuantizable: + return FFNetQuantizable.from_pretrained.__func__( + cls, 
"segmentation_ffnet78S_dBBB_mobile", aimet_encodings=aimet_encodings + ) + + @classmethod + def default_aimet_encodings(cls) -> str: + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() diff --git a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml new file mode 100644 index 00000000..17c2c4c9 --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: FFNet-78S-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 8362.0 + throughput: 119.58861516383641 + estimated_peak_memory_range: + min: 655360 + max: 2403480 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 154 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 154 + job_id: j1gly2oe5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:13:29.270963Z' diff --git 
a/qai_hub_models/models/ffnet_78s_quantized/test.py b/qai_hub_models/models/ffnet_78s_quantized/test.py new file mode 100644 index 00000000..7f2f8648 --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_quantized/test.py @@ -0,0 +1,17 @@ +from qai_hub_models.models._shared.ffnet.test_utils import run_test_off_target_numerical +from qai_hub_models.models.ffnet_78s_quantized.demo import main as demo_main +from qai_hub_models.models.ffnet_78s_quantized.model import FFNet78SQuantizable +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_off_target_numerical(): + run_test_off_target_numerical( + FFNet78SQuantizable, + "segmentation_ffnet78S_dBBB_mobile", + relax_numerics=True, + ) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/googlenet/README.md b/qai_hub_models/models/googlenet/README.md new file mode 100644 index 00000000..3ceaaa5a --- /dev/null +++ b/qai_hub_models/models/googlenet/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [GoogLeNet: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/googlenet) + +GoogLeNet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of GoogLeNet found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/googlenet). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. 
+ + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.googlenet.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.googlenet.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of GoogLeNet can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
+ + +## References +* [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py) diff --git a/qai_hub_models/models/googlenet/__init__.py b/qai_hub_models/models/googlenet/__init__.py new file mode 100644 index 00000000..42454ca1 --- /dev/null +++ b/qai_hub_models/models/googlenet/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import GoogLeNet as Model # noqa: F401 diff --git a/qai_hub_models/models/googlenet/demo.py b/qai_hub_models/models/googlenet/demo.py new file mode 100644 index 00000000..8ab16dfa --- /dev/null +++ b/qai_hub_models/models/googlenet/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.googlenet.model import GoogLeNet + + +def main(is_test: bool = False): + imagenet_demo(GoogLeNet, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/googlenet/export.py b/qai_hub_models/models/googlenet/export.py new file mode 100644 index 00000000..222b1310 --- /dev/null +++ b/qai_hub_models/models/googlenet/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.googlenet import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "googlenet" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "googlenet", + "GoogLeNet", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/googlenet/info.yaml b/qai_hub_models/models/googlenet/info.yaml new file mode 100644 index 00000000..41598ee8 --- /dev/null +++ b/qai_hub_models/models/googlenet/info.yaml @@ -0,0 +1,37 @@ +name: GoogLeNet +# id must match with the model dir name in qai_hub_models +id: googlenet +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: GoogLeNet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: [] +research_paper: https://arxiv.org/abs/1409.4842 +research_paper_title: Going Deeper with Convolutions +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py +technical_details: + Number of parameters: 6.62M + Model size: 49.7 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - squeezenet1_1 +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/googlenet/model.py b/qai_hub_models/models/googlenet/model.py new file mode 100644 index 00000000..b2a84ee4 --- /dev/null +++ b/qai_hub_models/models/googlenet/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class GoogLeNet(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.googlenet(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/googlenet/perf.yaml b/qai_hub_models/models/googlenet/perf.yaml new file mode 100644 index 00000000..5a446a3a --- /dev/null +++ b/qai_hub_models/models/googlenet/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - 
Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: GoogLeNet + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1471.0 + throughput: 679.8096532970768 + estimated_peak_memory_range: + min: 16384 + max: 1850752 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 94 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 94 + job_id: jw568z3vg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1808.0 + throughput: 553.0973451327434 + estimated_peak_memory_range: + min: 24576 + max: 31167584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 156 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 156 + job_id: j1p3z14x5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:07:34.463888Z' diff --git a/qai_hub_models/models/googlenet/test.py b/qai_hub_models/models/googlenet/test.py new file mode 100644 index 00000000..afd0c745 --- /dev/null +++ b/qai_hub_models/models/googlenet/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.googlenet.demo import main as demo_main +from qai_hub_models.models.googlenet.model import MODEL_ID, GoogLeNet + + +def test_task(): + run_imagenet_classifier_test(GoogLeNet.from_pretrained(), MODEL_ID) + + +def test_trace(): + run_imagenet_classifier_trace_test(GoogLeNet.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/googlenet_quantized/README.md 
b/qai_hub_models/models/googlenet_quantized/README.md new file mode 100644 index 00000000..fd3b845b --- /dev/null +++ b/qai_hub_models/models/googlenet_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [GoogLeNetQuantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/googlenet_quantized) + +GoogLeNet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of GoogLeNetQuantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/googlenet_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.googlenet_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.googlenet_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. 
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of GoogLeNetQuantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py) diff --git a/qai_hub_models/models/googlenet_quantized/__init__.py b/qai_hub_models/models/googlenet_quantized/__init__.py new file mode 100644 index 00000000..c1e3c2e0 --- /dev/null +++ b/qai_hub_models/models/googlenet_quantized/__init__.py @@ -0,0 +1,7 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) +from qai_hub_models.models.googlenet_quantized.model import MODEL_ID # noqa: F401 +from qai_hub_models.models.googlenet_quantized.model import ( # noqa: F401 + GoogLeNetQuantizable as Model, +) diff --git a/qai_hub_models/models/googlenet_quantized/demo.py b/qai_hub_models/models/googlenet_quantized/demo.py new file mode 100644 index 00000000..97a58dcf --- /dev/null +++ b/qai_hub_models/models/googlenet_quantized/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.googlenet_quantized.model import GoogLeNetQuantizable + + +def main(is_test: bool = False): + imagenet_demo(GoogLeNetQuantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/googlenet_quantized/export.py b/qai_hub_models/models/googlenet_quantized/export.py new file mode 100644 index 00000000..586146e8 --- /dev/null +++ b/qai_hub_models/models/googlenet_quantized/export.py @@ -0,0 +1,195 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.googlenet_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "googlenet_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "googlenet_quantized", + "GoogLeNetQuantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
def main():
    """Command-line entry point for exporting the model.

    Silences all warnings, builds the export argument parser for `Model`
    (with QNN support disabled), and forwards every parsed option to
    `export_model` as keyword arguments.
    """
    warnings.filterwarnings("ignore")
    cli_args = export_parser(model_cls=Model, supports_qnn=False).parse_args()
    # The argparse namespace maps one-to-one onto export_model's keyword arguments.
    export_model(**vars(cli_args))
class GoogLeNetQuantizable(AIMETQuantizableMixin, GoogLeNet):
    """GoogLeNet with post-training quantization support.

    Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints.
    Support for quantizing using your own weights & data will come at a later date."""

    def __init__(
        self,
        sim_model: QuantizationSimModel,
    ) -> None:
        """
        Parameters:
          sim_model: AIMET quantization simulation wrapping the network.
            `sim_model.model` is handed to the GoogLeNet base class, and the
            sim itself is handed to the AIMET mixin.
        """
        # Both bases are initialized explicitly because they take different arguments.
        GoogLeNet.__init__(self, sim_model.model)
        AIMETQuantizableMixin.__init__(
            self, sim_model, needs_onnx_direct_aimet_export=True
        )

    @classmethod
    def from_pretrained(
        cls,
        aimet_encodings: str | None = "DEFAULT",
    ) -> GoogLeNetQuantizable:
        """
        Build the quantization-simulated model from the pretrained float GoogLeNet.

        Parameters:
          aimet_encodings:
            if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette.
            elif None: Doesn't load any encodings. Used when computing encodings.
            else: Interprets as a filepath and loads the encodings stored there.

        Returns:
          An instance of this class (not the float `GoogLeNet`) wrapping the
          quantization sim, with the simulated model in eval mode.
        """
        model = GoogLeNet.from_pretrained()
        input_shape = model.get_input_spec()["image_tensor"][0]

        # Cross-layer equalization is applied before the quantization sim is built.
        equalize_model(model, input_shape)
        sim = QuantizationSimModel(
            model.net,
            quant_scheme="tf_enhanced",
            default_param_bw=8,
            default_output_bw=8,
            config_file=get_per_channel_aimet_config(),
            dummy_input=torch.rand(input_shape),
        )

        # Any falsy value (None or "") skips loading encodings entirely.
        if aimet_encodings:
            if aimet_encodings == "DEFAULT":
                aimet_encodings = CachedWebModelAsset.from_asset_store(
                    MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS
                ).fetch()
            load_encodings_to_sim(sim, aimet_encodings)

        sim.model.eval()
        return cls(sim)
def test_task():
    """Check the quantized classifier's predictions with the shared imagenet test."""
    quantized_model = GoogLeNetQuantizable.from_pretrained()
    run_imagenet_classifier_test(
        quantized_model,
        MODEL_ID,
        asset_version=MODEL_ASSET_VERSION,
        diff_tol=0.005,
        rtol=0.02,
        atol=0.2,
    )


def test_trace():
    """Run the shared imagenet trace test against the quantized model."""
    quantized_model = GoogLeNetQuantizable.from_pretrained()
    run_imagenet_classifier_trace_test(
        quantized_model,
        diff_tol=0.01,
        rtol=0.02,
        atol=0.2,
        is_quantized=True,
    )


def test_demo():
    """Smoke test: the demo entry point must run in test mode without raising."""
    demo_main(is_test=True)
def get_max_preds(batch_heatmaps):
    """
    Get keypoint predictions from score maps.

    Parameters:
        batch_heatmaps: numpy.ndarray([batch_size, num_joints, height, width])

    Returns:
        preds: numpy.ndarray([batch_size, num_joints, 2]), float32
            (x, y) location of the maximum of each joint's heatmap. Both
            coordinates are set to 0 when that maximum is not greater than 0.
        maxvals: numpy.ndarray([batch_size, num_joints, 1])
            The maximum heatmap value for each joint.
    """
    assert isinstance(
        batch_heatmaps, np.ndarray
    ), "batch_heatmaps should be numpy.ndarray"
    assert batch_heatmaps.ndim == 4, "batch_heatmaps should be 4-ndim"

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    width = batch_heatmaps.shape[3]
    # Flatten each heatmap so a single argmax yields a row-major pixel index.
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
    idx = np.argmax(heatmaps_reshaped, 2)
    maxvals = np.amax(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    # Duplicate the flat index into two columns, then decode column 0 as x
    # (index modulo width) and column 1 as y (index divided by width, floored).
    preds = np.tile(idx, (1, 1, 2)).astype(np.float32)

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)

    # Zero out joints whose best score is not strictly positive.
    pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
    pred_mask = pred_mask.astype(np.float32)

    preds *= pred_mask
    return preds, maxvals


class HRNetPoseApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with HRNetPose.

    The app uses 1 model:
        * HRNetPose

    For a given image input, the app will:
        * pre-process the image
        * Run HRNetPose inference
        * Convert the output into a list of keypoint coordinates
    """

    def __init__(
        self,
        # NOTE(review): predict_pose_keypoints calls `.detach()` directly on the
        # model output, i.e. it treats it as a single tensor rather than the
        # 3-tuple declared here -- confirm and tighten this annotation.
        model: Callable[
            [torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
        ],
    ):
        self.model = model
        # Use mmpose inferencer for example preprocessing
        self.inferencer = MMPoseInferencer(
            DEFAULT_INFERENCER_ARCH, device=torch.device(type="cpu")
        )
        self.pre_processor = self.inferencer.inferencer.model.data_preprocessor

    def predict(self, *args, **kwargs):
        # See predict_pose_keypoints.
        return self.predict_pose_keypoints(*args, **kwargs)

    def predict_pose_keypoints(
        self,
        pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image],
        raw_output=False,
    ) -> np.ndarray | List[Image]:
        """
        Predicts pose keypoints for a person in the image.

        Parameters:
            pixel_values_or_image
                PIL image(s)
                or
                numpy array (N H W C x uint8) or (H W C x uint8) -- both RGB channel layout
                or
                pyTorch tensor (N C H W x fp32, value range is [0, 1]), RGB channel layout

            raw_output: bool
                See "returns" doc section for details.

        Returns:
            If raw_output is true, returns:
                keypoints: np.ndarray, shape [B, N, 2]
                    Numpy array of keypoints within the images. Each keypoint is an (x, y) pair of coordinates within the image.

            Otherwise, returns:
                predicted_images: List[PIL.Image]
                    Images with keypoints drawn.
        """
        # Preprocess image to get data required for post processing
        NHWC_int_numpy_frames, _ = app_to_net_image_inputs(pixel_values_or_image)
        inputs = self.inferencer.preprocess(NHWC_int_numpy_frames, batch_size=1)
        # NOTE(review): only the first preprocessed sample is used here, while the
        # drawing loop at the end indexes `keypoints` once per input frame --
        # inputs with more than one image look unsupported; confirm.
        proc_inputs, _ = list(inputs)[0]
        proc_inputs_ = proc_inputs["inputs"][0]

        # Reverse the order of the first (channel) axis, then normalize with
        # the mmpose preprocessor statistics.
        x = proc_inputs_[[2, 1, 0], ...]
        x = (x - self.pre_processor.mean) / self.pre_processor.std
        x = torch.unsqueeze(x, 0)

        # run inference
        heatmaps = self.model(x)
        heatmaps = heatmaps.detach().numpy()

        # create predictions from heatmap (the max scores are not needed here)
        pred_kps, _ = get_max_preds(heatmaps)

        # get the bounding box center from the preprocessing
        # In older versions of the MM modules the center is directly a member
        # of gt_instances and does not need to be computed.
        bbox = proc_inputs["data_samples"][0].gt_instances.bboxes[0]
        center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]

        scale = proc_inputs["data_samples"][0].gt_instances.bbox_scales[0]

        # perform refinement
        keypoints = refine_keypoints(pred_kps, np.squeeze(heatmaps))
        # presumably the heatmap is 4x smaller than the network input -- TODO confirm
        scale_factor = np.array([4.0, 4.0])
        keypoints = keypoints * scale_factor
        input_size = proc_inputs["data_samples"][0].metainfo["input_size"]
        # Map from network-input coordinates back into the original image via
        # the bounding box scale and center.
        keypoints = keypoints / input_size * scale + center - 0.5 * scale
        keypoints = np.round(keypoints).astype(np.int32)

        if raw_output:
            return keypoints

        predicted_images = []
        for i, img in enumerate(NHWC_int_numpy_frames):
            draw_points(img, keypoints[i], color=(255, 0, 0), size=2)
            predicted_images.append(fromarray(img))
        return predicted_images
def main(is_test: bool = False):
    """Run the HRNetPose demo: predict keypoints on an image and show or save the result.

    Parameters:
        is_test: When True, command-line arguments are ignored (defaults are
            used) and the resulting image is neither displayed nor saved.
    """
    # Demo parameters
    parser = get_model_cli_parser(HRNetPose)
    parser = get_on_device_demo_parser(parser, add_output_dir=True)
    parser.add_argument(
        "--image",
        type=str,
        default=IMAGE_ADDRESS,
        help="image file path or URL",
    )
    args = parser.parse_args([] if is_test else None)
    validate_on_device_demo_args(args, HRNetPose.get_model_id())

    # Load image & model
    model = demo_model_from_cli_args(HRNetPose, args)
    image = load_image(args.image)
    print("Model Loaded")

    app = HRNetPoseApp(model)
    # Without raw_output, the app returns images with the keypoints drawn on them.
    annotated_image = app.predict_pose_keypoints(image)[0]
    if not is_test:
        # The parser is built with add_output_dir=True, so the destination is
        # read from `output_dir` (as the quantized HRNetPose demo does).
        display_or_save_image(
            annotated_image, args.output_dir, "hrnetpose_demo_output.png", "keypoints"
        )
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/<model_name>`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        When Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead.
    """
    model_name = "hrnet_pose"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "hrnet_pose",
            "HRNetPose",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The input is named "image" in HRNetPose.get_input_spec / forward, so the
    # channel-last option and the transpose below must use that same name.
    # NOTE(review): this file is marked auto-generated -- the generator input
    # presumably needs the same correction; confirm.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    # NOTE(review): the file is always saved with a ".tflite" suffix, whatever
    # dst_runtime was requested -- confirm this is intended.
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # `sample_inputs` exists here because this branch shares the
        # `not skip_inferencing` condition with step 4.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
+tags: [] +research_paper: https://arxiv.org/abs/1902.09212 +research_paper_title: Deep High-Resolution Representation Learning for Human Pose Estimation +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet +technical_details: + Number of parameters: 28.5M + Model size: 114.2 MB + Model checkpoint: hrnet_posenet_FP32_state_dict + Input resolution: 192x256 +applicable_scenarios: + - Injury prevention training + - Sports performance analysis + - Posture recognition +form_factors: + - Phone + - Tablet + - IoT +related_models: ['litehrnet', 'openpose'] +has_static_banner: yes +has_animated_banner: no +license_type: other +dataset: [] diff --git a/qai_hub_models/models/hrnet_pose/model.py b/qai_hub_models/models/hrnet_pose/model.py new file mode 100644 index 00000000..75fa56ba --- /dev/null +++ b/qai_hub_models/models/hrnet_pose/model.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import sys + +import torch +import torch.nn as nn + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +# This model originally comes from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# but we'll use the weights from AIMET +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/hrnet_posenet/models/model_cards/hrnet_posenet_w8a8.json +# Weights are found here +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_march_artifacts/hrnet_posenet_FP32_state_dict.pth +DEFAULT_WEIGHTS = "hrnet_posenet_FP32_state_dict.pth" +SOURCE_REPOSITORY = "https://github.com/leoxiaobin/deep-high-resolution-net.pytorch" +COMMIT_HASH = "6f69e4676ad8d43d0d61b64b1b9726f0c369e7b1" 
class HRNetPose(BaseModel):
    """HRNet pose-estimation network wrapped as a `BaseModel`."""

    def __init__(self, model: nn.Module) -> None:
        """Store the already-constructed network that `forward` delegates to."""
        super().__init__()
        self.model = model

    @classmethod
    def from_pretrained(cls) -> HRNetPose:
        """Build the network from the source repository and load the default weights.

        The weights file `DEFAULT_WEIGHTS` is fetched from the asset store and
        loaded onto the CPU. The architecture comes from `SOURCE_REPOSITORY`
        at `COMMIT_HASH`, configured by `CONFIG_FILE`.

        Returns:
            The wrapped model, switched to eval mode.
        """

        weights_file = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_WEIGHTS
        ).fetch()
        weights = torch.load(weights_file, map_location="cpu")
        # NOTE(review): SourceAsRoot presumably makes the cloned repository the
        # import root / working directory for this block, which the relative
        # "./lib" path and CONFIG_FILE below rely on -- confirm.
        with SourceAsRoot(
            SOURCE_REPOSITORY, COMMIT_HASH, MODEL_ID, MODEL_ASSET_VERSION
        ):
            # The repository's own modules are imported only after "./lib" has
            # been added to the import path.
            sys.path.append("./lib")
            from lib.config import cfg
            from models.pose_hrnet import PoseHighResolutionNet

            cfg.merge_from_file(CONFIG_FILE)
            cfg.freeze()
            net = PoseHighResolutionNet(cfg)
            net.load_state_dict(weights)
            return cls(net).eval()

    def forward(self, image: torch.Tensor):
        """Run the wrapped network on `image` and return its output unchanged."""
        return self.model(image)

    @staticmethod
    def get_input_spec(
        height: int = 256,
        width: int = 192,
    ) -> InputSpec:
        """Return the input spec: one float32 input named "image" of shape (1, 3, height, width)."""
        return {"image": ((1, 3, height, width), "float32")}
@skip_clone_repo_check
def test_task():
    """Compare the app's rendered output on the demo image with the stored reference image."""
    demo_image = load_image(IMAGE_ADDRESS)
    pose_app = HRNetPoseApp(model=HRNetPose.from_pretrained())
    rendered = pose_app.predict(demo_image)[0]

    reference = load_image(OUTPUT_IMAGE_ADDRESS)
    # Both images are scaled to [0, 1] floats before the tolerance comparison.
    rendered_scaled = np.asarray(rendered, dtype=np.float32) / 255
    reference_scaled = np.asarray(reference, dtype=np.float32) / 255
    assert_most_close(
        rendered_scaled,
        reference_scaled,
        0.005,
        rtol=0.02,
        atol=0.2,
    )


@skip_clone_repo_check
def test_demo():
    """Smoke test: the demo entry point must run in test mode without raising."""
    demo_main(is_test=True)
+ +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.hrnet_pose_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of HRNetPoseQuantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). + + +## References +* [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1902.09212) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet) diff --git a/qai_hub_models/models/hrnet_pose_quantized/__init__.py b/qai_hub_models/models/hrnet_pose_quantized/__init__.py new file mode 100644 index 00000000..b64da129 --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/__init__.py @@ -0,0 +1,4 @@ +from qai_hub_models.models.hrnet_pose.app import HRNetPoseApp # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import HRNetPoseQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/hrnet_pose_quantized/demo.py b/qai_hub_models/models/hrnet_pose_quantized/demo.py new file mode 100644 index 00000000..2d7a1a92 --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/demo.py @@ -0,0 +1,53 @@ +from qai_hub_models.models.hrnet_pose.app import HRNetPoseApp +from qai_hub_models.models.hrnet_pose_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + HRNetPoseQuantizable, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from 
qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "hrnet_pose_demo.png" +) + + +# The demo will display an image with the predicted keypoints. +def main(is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(HRNetPoseQuantizable) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=IMAGE_ADDRESS, + help="image file path or URL", + ) + + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, HRNetPoseQuantizable.get_model_id()) + + # Load image & model + model = demo_model_from_cli_args(HRNetPoseQuantizable, args) + image = load_image(args.image) + print("Model Loaded") + + app = HRNetPoseApp(model) + keypoints = app.predict_pose_keypoints(image)[0] + if not is_test: + display_or_save_image( + keypoints, + args.output_dir, + "hrnetpose_quantized_demo_output.png", + "keypoints", + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/hrnet_pose_quantized/export.py b/qai_hub_models/models/hrnet_pose_quantized/export.py new file mode 100644 index 00000000..0badcc48 --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/export.py @@ -0,0 +1,203 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.hrnet_pose_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. 
+ skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/hrnet_pose_quantized` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "hrnet_pose_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "hrnet_pose_quantized", + "HRNetPoseQuantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image_tensor" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." 
+ ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/hrnet_pose_quantized/info.yaml b/qai_hub_models/models/hrnet_pose_quantized/info.yaml new file mode 100644 index 00000000..dcb5487e --- /dev/null +++ 
b/qai_hub_models/models/hrnet_pose_quantized/info.yaml @@ -0,0 +1,32 @@ +name: HRNetPoseQuantized +# id must match with the model dir name in qai_hub_models +id: hrnet_pose_quantized +status: public +headline: Perform accurate human pose estimation. +domain: Computer Vision +use_case: Pose Estimation +description: HRNet performs pose estimation in high-resolution representations. +tags: + - quantized +research_paper: https://arxiv.org/abs/1902.09212 +research_paper_title: Deep High-Resolution Representation Learning for Human Pose Estimation +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet +technical_details: + Number of parameters: 28.5M + Model size: _ MB + Model checkpoint: hrnet_posenet_FP32_state_dict + Input resolution: 192x256 +applicable_scenarios: + - Injury prevention training + - Sports performance analysis + - Posture recognition +form_factors: + - Phone + - Tablet + - IoT +related_models: ['litehrnet', 'hrnet_pose'] +has_static_banner: yes +has_animated_banner: no +license_type: other +dataset: [] diff --git a/qai_hub_models/models/hrnet_pose_quantized/model.py b/qai_hub_models/models/hrnet_pose_quantized/model.py new file mode 100644 index 00000000..a5bd74d4 --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/model.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.hrnet_pose.model import HRNetPose +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( # isort: skip + AIMETQuantizableMixin, +) + + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/hrnet_posenet/models/model_cards/hrnet_posenet_w8a8.json: +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_march_artifacts/hrnet_posenet_W8A8_state_dict.pth +# Encodings were generated with AIMET QuantSim export +QUANTIZED_WEIGHTS = "hrnet_posenet_W8A8_state_dict.pth" +AIMET_ENCODINGS = "hrnetpose_aimet_quantization_encodings.json" +AIMET_CONFIG = "default_config_per_channel.json" + + +class HRNetPoseQuantizable(AIMETQuantizableMixin, HRNetPose): + """HRNetPose with post training quantization support + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + hrnet_model: QuantizationSimModel, + ) -> None: + HRNetPose.__init__(self, hrnet_model.model) + AIMETQuantizableMixin.__init__( + self, hrnet_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained(cls) -> HRNetPoseQuantizable: + model = HRNetPose.from_pretrained() + input_shape = HRNetPose.get_input_spec()["image"][0] + equalize_model(model, input_shape) + + weights = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS + ).fetch() + aimet_config = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG + ).fetch() + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + ).fetch() + + # Load the model weights and quantization parameters + state_dict = torch.load(weights, map_location=torch.device("cpu")) + new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} + model.load_state_dict(new_state_dict) + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=aimet_config, + dummy_input=torch.rand(input_shape), + ) + load_encodings_to_sim(sim, aimet_encodings) + + return cls(sim) diff --git a/qai_hub_models/models/hrnet_pose_quantized/perf.yaml b/qai_hub_models/models/hrnet_pose_quantized/perf.yaml new file mode 100644 index 00000000..8e720351 --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: HRNetPoseQuantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 2508.0 + throughput: 398.72408293460927 + estimated_peak_memory_range: + min: 16384 + max: 3642928 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 515 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 515 + job_id: jz57eljqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: 
Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:23:32.990808Z' diff --git a/qai_hub_models/models/hrnet_pose_quantized/requirements.txt b/qai_hub_models/models/hrnet_pose_quantized/requirements.txt new file mode 100644 index 00000000..69edf6ae --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/requirements.txt @@ -0,0 +1,4 @@ +yacs==0.1.8 +mmpose<=1.2.0 +mmcv==2.1.0 +mmdet<=3.2.0 diff --git a/qai_hub_models/models/hrnet_pose_quantized/test.py b/qai_hub_models/models/hrnet_pose_quantized/test.py new file mode 100644 index 00000000..2c8b86eb --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/test.py @@ -0,0 +1,42 @@ +import numpy as np +import torch + +from qai_hub_models.models.hrnet_pose.app import HRNetPoseApp +from qai_hub_models.models.hrnet_pose.demo import IMAGE_ADDRESS +from qai_hub_models.models.hrnet_pose.demo import main as demo_main +from qai_hub_models.models.hrnet_pose_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + HRNetPoseQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_IMAGE_LOCAL_PATH = "hrnetpose_quantized_output.png" +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, OUTPUT_IMAGE_LOCAL_PATH +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. 
+ torch.manual_seed(0) + image = load_image(IMAGE_ADDRESS) + model = HRNetPoseQuantizable.from_pretrained() + app = HRNetPoseApp(model=model) + output = app.predict(image)[0] + + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_close( + np.asarray(output, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + 0.005, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md new file mode 100644 index 00000000..170ac687 --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [HuggingFace-WavLM-Base-Plus: Real-time Speech processing](https://aihub.qualcomm.com/models/huggingface_wavlm_base_plus) + +HuggingFaceWavLMBasePlus is a real time speech processing backbone based on Microsoft's WavLM model. + +This is based on the implementation of HuggingFace-WavLM-Base-Plus found +[here](https://huggingface.co/patrickvonplaten/wavlm-libri-clean-100h-base-plus/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/huggingface_wavlm_base_plus). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[huggingface_wavlm_base_plus]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.huggingface_wavlm_base_plus.demo +``` +More details on the CLI tool can be found with the `--help` option. 
See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.huggingface_wavlm_base_plus.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of HuggingFace-WavLM-Base-Plus can be found + [here](https://github.com/microsoft/unilm/blob/master/LICENSE). + + +## References +* [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) +* [Source Model Implementation](https://huggingface.co/patrickvonplaten/wavlm-libri-clean-100h-base-plus/tree/main) diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/__init__.py b/qai_hub_models/models/huggingface_wavlm_base_plus/__init__.py new file mode 100644 index 00000000..2e80301b --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/__init__.py @@ -0,0 +1,3 @@ +from .app import HuggingFaceWavLMBasePlusApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import HuggingFaceWavLMBasePlus as Model # noqa: F401 diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/app.py b/qai_hub_models/models/huggingface_wavlm_base_plus/app.py new file mode 100644 index 00000000..bba47cf0 --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/app.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import numpy as np +import torch + +from 
qai_hub_models.models.huggingface_wavlm_base_plus.model import ( + DEFAULT_INPUT_LENGTH_SECONDS, +) + + +class HuggingFaceWavLMBasePlusApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with HuggingFaceWavLMBasePlus. + + The app uses 1 model: + * HuggingFaceWavLMBasePlus + + For a given audio input, the app will: + * Run HuggingFaceWavLMBasePlus inference on the input and return the output feature vectors + """ + + def __init__(self, wavlm_model): + self.model = wavlm_model + + def predict(self, *args, **kwargs): + # See predict_features. + return self.predict_features(*args, **kwargs) + + def predict_features( + self, input: np.ndarray, sampling_rate=16000.0 + ) -> torch.Tensor: + """ + Predict a feature vector from an audio sample + + Parameters: + input: a 1xn array representing an audio sample, where n is length. + This will be clipped to the appropriate length if too long, + and padded if too short + sampling_rate: the sampling rate of the audio - default 16kHz + + Returns: + feature_vec: a tuple of tensors + 1x999x768 + 1x999x512 + features detected in the audio stream + """ + + # preprocess audio + input_len = int(DEFAULT_INPUT_LENGTH_SECONDS * sampling_rate) + x = input[:input_len] + x = torch.from_numpy(x).float() + x = torch.nn.functional.pad( + x, (0, input_len - x.shape[0]), mode="constant", value=0 + ) + audio_tensor = x.unsqueeze(0) + + # Run prediction + features = self.model(audio_tensor) + + return features diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/demo.py b/qai_hub_models/models/huggingface_wavlm_base_plus/demo.py new file mode 100644 index 00000000..8d50df07 --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/demo.py @@ -0,0 +1,41 @@ +from datasets import load_dataset + +from qai_hub_models.models.huggingface_wavlm_base_plus.app import ( + HuggingFaceWavLMBasePlusApp, +) +from qai_hub_models.models.huggingface_wavlm_base_plus.model import ( + 
HuggingFaceWavLMBasePlus, +) +from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args + +HUGGINGFACE_WAVLM_DATASET = "hf-internal-testing/librispeech_asr_demo" + + +# Run HuggingFace WavLM on a sample audio input, and produce +# a feature vector from the audio. The feature vector will be printed to terminal +def demo_main(is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(HuggingFaceWavLMBasePlus) + args = parser.parse_args([] if is_test else None) + + # load model + model = model_from_cli_args(HuggingFaceWavLMBasePlus, args) + + # Load Application + app = HuggingFaceWavLMBasePlusApp(model) + + # Load audio + dataset = load_dataset(HUGGINGFACE_WAVLM_DATASET, "clean", split="validation") + audio = [x["array"] for x in dataset[:2]["audio"]][0] + + feature_vec = app.predict_features(input=audio) + + # Get output from model + if not is_test: + print("Feature vec from audio:\n") + print(feature_vec) + print("\n") + + +if __name__ == "__main__": + demo_main() diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py new file mode 100644 index 00000000..f0e4eacd --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py @@ -0,0 +1,177 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.huggingface_wavlm_base_plus import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/huggingface_wavlm_base_plus` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "huggingface_wavlm_base_plus" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "huggingface_wavlm_base_plus", + "HuggingFace-WavLM-Base-Plus", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml b/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml new file mode 100644 index 00000000..3057f670 --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml @@ -0,0 +1,32 @@ +name: HuggingFace-WavLM-Base-Plus +# id must match with the model dir name in qai_hub_models +id: huggingface_wavlm_base_plus +status: public +headline: Real-time Speech processing. +domain: Audio +description: HuggingFaceWavLMBasePlus is a real time speech processing backbone based on Microsoft's WavLM model. 
class HuggingFaceWavLMBasePlus(BaseModel):
    """Exportable WavLM speech-feature backbone wrapping a HuggingFace WavLM model."""

    def __init__(
        self, wavlm_model: torch.nn.Module, apply_apple_npu_opt: bool = False
    ) -> None:
        """
        Parameters:
            wavlm_model: The WavLM network to wrap.
            apply_apple_npu_opt: If set, the network is first passed through
                `convert_to_wavlm_npu`, which swaps its first two feature
                extractor convolutions for sliced, NPU-friendly equivalents.
        """
        super().__init__()

        if apply_apple_npu_opt:
            wavlm_model = convert_to_wavlm_npu(wavlm_model)

        self.model = wavlm_model

    @classmethod
    def from_pretrained(
        cls, weights_path: str | None = None, apply_apple_npu_opt: bool = False
    ) -> HuggingFaceWavLMBasePlus:
        """
        Load WavLM from weights published in the source HuggingFace WavLM repository.

        Parameters:
            weights_path: HuggingFace model identifier or local path.
                Falls back to DEFAULT_WEIGHTS when None.
            apply_apple_npu_opt: Forwarded to the constructor.
        """
        if weights_path is None:
            # Single source of truth for the checkpoint name.
            weights_path = DEFAULT_WEIGHTS

        model = WavLMModel.from_pretrained(weights_path, torchscript=True)

        return cls(model, apply_apple_npu_opt)

    def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run WavLM on `input` and produce feature vectors.

        Parameters:
            input: Raw audio samples, shaped (batch, num_samples).
                The original documentation describes a 1x320000 tensor,
                i.e. 20 seconds of audio sampled at 16kHz.

        Returns:
            feature_vec: a tuple of tensors holding the features detected in
                the audio stream. For the 1x320000 input above these were
                documented as 1x999x768 and 1x999x512.
        """
        return self.model(input)

    def get_input_spec(
        self,
        batch_size: int = 1,
        sample_length: int = 80000,
    ) -> InputSpec:
        """
        Describe the model input for a hub job submission.

        This can be used with the qai_hub python API to declare the model
        input specification upon submitting a profile job.

        NOTE: the default `sample_length` (80000) is shorter than the 320000
        samples mentioned in `forward`'s documentation; pass `sample_length`
        explicitly to describe a longer clip.
        """
        return {"input": ((batch_size, sample_length), "float32")}
# Modules used to override Huggingface WavLM to be NPU friendly
class SliceConv1d(torch.nn.Module):
    """Apply a Conv1d slice by slice so no single conv sees a very long input.

    The input is cut along its last dimension into windows of roughly
    `slice_size` elements. Each window is extended by exactly the number of
    trailing samples the convolution needs, so the concatenated slice outputs
    equal the output of the un-sliced convolution.

    NOTE(review): the slicing assumes the wrapped conv uses no padding
    (padding would be re-applied at every slice edge); this is not checked.
    """

    def __init__(self, orig_module: torch.nn.Conv1d, slice_size: int = 16000):
        """Slice inputs to conv1d to limit the input size to any conv.

        Parameters:
            orig_module: The convolution to wrap; its weights are reused as-is.
            slice_size: Target number of input elements handled per slice.
        """
        super().__init__()
        assert isinstance(orig_module, torch.nn.Conv1d)
        self.orig_module = orig_module
        self.slice_size = slice_size

        out_channels, in_channels, kernel_size_1d = orig_module.weight.shape
        # Kept for backward compatibility with code that reads this attribute.
        self.half_kernel_size = kernel_size_1d // 2
        self.stride = orig_module.stride[0]
        # Number of input samples spanned by one kernel application.
        self.kernel_extent = orig_module.dilation[0] * (kernel_size_1d - 1) + 1

    def _align_to_stride(self, position: int) -> int:
        """Round `position` up to the next multiple of the conv stride."""
        return int(math.ceil(position / self.stride)) * self.stride

    def forward(self, x: torch.Tensor):
        """Run the wrapped conv over consecutive slices of `x` (indexed as [N, C, L])."""
        seq_len = x.shape[-1]
        num_slices = int(math.ceil(seq_len / self.slice_size))

        xs = []
        for i in range(num_slices):
            # align begin to stride boundary
            begin = self._align_to_stride(i * self.slice_size)
            next_begin = self._align_to_stride((i + 1) * self.slice_size)
            # The last window of this slice starts at next_begin - stride and
            # needs kernel_extent samples. For kernel 3 / stride 2 this is the
            # historical `begin + slice_size + kernel // 2`.
            end = min(next_begin - self.stride + self.kernel_extent, seq_len)
            if end - begin < self.kernel_extent:
                # Too short to hold a single kernel window: every output it
                # could contribute was already produced by the previous slice.
                continue
            xs.append(self.orig_module(x[:, :, begin:end]))

        if not xs:
            # Input shorter than one kernel window: defer to the wrapped conv
            # so the caller sees its usual error.
            return self.orig_module(x)
        return torch.concat(xs, dim=-1)
class WavLMGroupNormConvLayerNPU(torch.nn.Module):
    """NPU-friendly replacement for a `WavLMGroupNormConvLayer`.

    The strided 1-D convolution is re-expressed as a 2-D convolution over the
    input folded into `stride` rows, and that 2-D convolution is evaluated in
    fixed-width slices. The wrapped layer's own `layer_norm` and `activation`
    are reused unchanged.
    """

    def __init__(self, orig_module: WavLMGroupNormConvLayer, slice_size: int = 16000):
        """
        Apple NPU prefer spatial dim not much higher than 16000. We
        wrap WavLMGroupNormConvLayer to adhere to that as much as
        possible

        Parameters:
            orig_module: The layer to wrap. Its conv must have no padding and
                a kernel size that is a multiple of its stride.
            slice_size: Width of each slice of the folded input.
        """
        super().__init__()
        assert isinstance(orig_module, WavLMGroupNormConvLayer)
        self.orig_module = orig_module
        self.slice_size = slice_size
        # stack conv1d to conv2d to reduce input dim
        conv1d = orig_module.conv
        out_channels, in_channels, kernel_size_1d = conv1d.weight.shape
        stride_1d = conv1d.stride[0]
        self.stride_1d = stride_1d
        assert kernel_size_1d % stride_1d == 0
        assert conv1d.padding == (0,)
        kernel_size_2d = (stride_1d, kernel_size_1d // stride_1d)
        self.conv2d = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size_2d, bias=conv1d.bias is not None
        )
        # Fold each 1-D kernel into [stride, kernel // stride] so that it lines
        # up with the folded input built in forward().
        self.conv2d.weight.data = (
            conv1d.weight.data.clone()
            .view(out_channels, in_channels, kernel_size_1d // stride_1d, stride_1d)
            .permute(0, 1, 3, 2)
        )
        if conv1d.bias is not None:
            assert self.conv2d.bias is not None  # for mypy
            self.conv2d.bias.data = conv1d.bias.data
        self.half_kernel_size = kernel_size_2d[1] // 2

    def forward(self, x):
        """
        Parameters:
            x: [1, 1, seq_len] audio tensor (e.g. seq_len = 160000 for 10s
                audio). seq_len must be a positive multiple of
                `stride * slice_size`.

        Raises:
            ValueError: if seq_len does not satisfy the constraint above.
                Such inputs used to be silently truncated to whole slices, or
                failed with an opaque concat error when shorter than one slice.
        """
        seq_len = x.shape[-1]
        assert seq_len % self.stride_1d == 0
        slice_size = self.slice_size
        folded_len = seq_len // self.stride_1d
        if folded_len == 0 or folded_len % slice_size != 0:
            raise ValueError(
                f"seq_len must be a positive multiple of "
                f"{self.stride_1d * slice_size}, got {seq_len}"
            )
        x = x.view(1, 1, folded_len, self.stride_1d).permute(0, 1, 3, 2)
        # x has shape [1, 1, stride, folded_len] (e.g. [1, 1, 5, 32000])
        # divide it into segments of slice_size
        num_slices = folded_len // slice_size
        xs = []
        for i in range(num_slices):
            begin = i * slice_size
            end = min(begin + slice_size + self.half_kernel_size, x.shape[-1])
            conv_out = self.conv2d(x[:, :, :, begin:end])
            if i == num_slices - 1:
                # The last slice can have 1 fewer element than the previous
                # slices. In order to stack it, we pad 1 (a good
                # approximation; the padded element is dropped again below).
                num_pad = slice_size - conv_out.shape[-1]
                if num_pad > 1:
                    raise ValueError("Should only have 1 elem missing")
                elif num_pad == 1:
                    conv_out = torch.nn.functional.pad(conv_out, (0, 1))
            # conv_out has shape [1, out_channels, 1, slice_size]
            xs.append(conv_out)
        # x has shape [1, out_channels, num_slices, slice_size]
        x = torch.concat(xs, axis=2)

        # apply group norm
        x = self.orig_module.layer_norm(x)
        x = self.orig_module.activation(x)
        # Lay the slices back end to end along the time axis.
        x = torch.concat(torch.unbind(x, axis=2), axis=-1)
        # Drop the trailing element that was only added as padding.
        return x[:, :, :-1]
def convert_to_wavlm_npu(model: WavLMModel):
    """
    Apply changes to make model NPU friendly.

    The first two layers of the feature extractor are patched in place:
    layer 0 is wrapped in `WavLMGroupNormConvLayerNPU` and the conv of
    layer 1 is wrapped in `SliceConv1d`. Both wrappers are put in eval mode.

    Returns:
        The same `model` instance, modified in place.
    """
    assert isinstance(model, WavLMModel)

    first_layer = model.feature_extractor.conv_layers[0]
    assert isinstance(first_layer, WavLMGroupNormConvLayer)
    # Replace with NPU friendly implementation
    model.feature_extractor.conv_layers[0] = WavLMGroupNormConvLayerNPU(
        first_layer
    ).eval()

    second_conv = model.feature_extractor.conv_layers[1].conv
    assert isinstance(second_conv, torch.nn.Conv1d)
    # Replace with NPU friendly implementation
    model.feature_extractor.conv_layers[1].conv = SliceConv1d(second_conv).eval()

    return model
Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: HuggingFace-WavLM-Base-Plus + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 463847.0 + throughput: 2.1558832977253277 + estimated_peak_memory_range: + min: 10719232 + max: 13863736 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 88 + layers_on_cpu: 748 + total_layers: 836 + job_id: jo5m06wyg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:32:02.862530Z' diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt b/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt new file mode 100644 index 00000000..0e2962fb --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt @@ -0,0 +1,4 @@ +transformers>=4.31.0 +soundfile>=0.12.1 +librosa>=0.10.1 +datasets>=2.14.5 diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/test.py b/qai_hub_models/models/huggingface_wavlm_base_plus/test.py new file mode 100644 index 00000000..0fbbc073 --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/test.py @@ -0,0 +1,75 @@ +import numpy as np +import torch +from datasets import load_dataset + +from qai_hub_models.models.huggingface_wavlm_base_plus.app import ( + HuggingFaceWavLMBasePlusApp, +) +from qai_hub_models.models.huggingface_wavlm_base_plus.demo import demo_main +from qai_hub_models.models.huggingface_wavlm_base_plus.model import ( + 
def _test_impl(app: HuggingFaceWavLMBasePlusApp):
    """Check the app's features for one LibriSpeech clip against stored references.

    The first sample (after sorting by id) of the demo LibriSpeech validation
    split is run through `app.predict_features`, and each of the two returned
    tensors is compared with its cached reference tensor.
    """
    # Load input data
    dataset = load_dataset(
        "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
    ).sort("id")
    audio = dataset[0]["audio"]["array"]
    sampling_rate = dataset.features["audio"].sampling_rate

    # Load expected output data
    expected_arrays = [
        torch.load(asset.fetch()).detach().numpy()
        for asset in (OUTPUT_TENSOR_1, OUTPUT_TENSOR_2)
    ]

    # Run inference
    app_output_features = app.predict_features(audio, sampling_rate)

    # Compare outputs
    for index, expected in enumerate(expected_arrays):
        np.testing.assert_allclose(
            np.asarray(app_output_features[index].detach().numpy(), dtype=np.float32),
            np.asarray(expected, dtype=np.float32),
            rtol=0.02,
            atol=0.2,
        )
--- /dev/null +++ b/qai_hub_models/models/inception_v3/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Inception-v3: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/inception_v3) + +InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of Inception-v3 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/inception_v3). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.inception_v3.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.inception_v3.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. 
def main(is_test: bool = False):
    """Run the shared ImageNet classifier demo with the Inception-v3 model.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo`.
    """
    model_cls = InceptionNetV3
    imagenet_demo(model_cls, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export Inception-v3 for on-device use. Six stages are performed:

    1. Instantiate the PyTorch model and trace it to TorchScript.
    2. Compile the traced model to an asset that can be run on device.
    3. Profile the compiled model on a real device.
    4. Run inference on device with sample inputs.
    5. Download the compiled asset to the local directory.
    6. Summarize the results from profiling and inference.

    Each of the last four stages can be skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `build/inception_v3` under the current working directory.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When AI Hub is not accessible, the result of `export_without_hub_access`.
        Otherwise a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
    """
    model_name = "inception_v3"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)

    # Without hub access there is nothing to submit; hand off to the offline path.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "Inception-v3",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    summarize_profile = not (skip_summary or skip_profiling)
    summarize_inference = not (skip_summary or skip_inferencing)

    # 1. Initialize PyTorch model
    pretrained_kwargs = get_model_kwargs(Model, additional_model_kwargs)
    model = Model.from_pretrained(**pretrained_kwargs)
    spec_kwargs = get_input_spec_kwargs(model, additional_model_kwargs)
    input_spec = model.get_input_spec(**spec_kwargs)

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        download_target = output_path / f"{model_name}.tflite"
        compile_job.get_target_model().download(str(download_target))

    # 6. Summarize the results from profiling and inference
    if summarize_profile:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if summarize_inference:
        # sample_inputs was bound in stage 4, which ran whenever this branch does.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
def main():
    """Command-line entry point for the Inception-v3 export script.

    Silences Python warnings, parses the export options from the command
    line and forwards them to `export_model` as keyword arguments.
    """
    warnings.filterwarnings("ignore")
    cli_args = export_parser(model_cls=Model).parse_args()
    export_model(**vars(cli_args))
class InceptionNetV3(ImagenetClassifier):
    """Inception-v3 ImageNet classifier configured with torchvision's `inception_v3`."""

    # torchvision constructor for the network; presumably invoked by
    # ImagenetClassifier when loading the model (confirm in the base class).
    model_builder = tv_models.inception_v3
    # Class-level copy of the module constant DEFAULT_WEIGHTS ("IMAGENET1K_V1").
    DEFAULT_WEIGHTS = DEFAULT_WEIGHTS
- Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Inception-v3 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1944.0 + throughput: 514.40329218107 + estimated_peak_memory_range: + min: 24576 + max: 2564456 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 141 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 141 + job_id: j1p8em8zp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2266.0 + throughput: 441.306266548985 + estimated_peak_memory_range: + min: 360448 + max: 133509928 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 232 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 232 + job_id: jogk2qdyg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:21:24.010787Z' diff --git a/qai_hub_models/models/inception_v3/test.py b/qai_hub_models/models/inception_v3/test.py new file mode 100644 index 00000000..b8b66fa0 --- /dev/null +++ b/qai_hub_models/models/inception_v3/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.inception_v3.demo import main as demo_main +from qai_hub_models.models.inception_v3.model import MODEL_ID, InceptionNetV3 + + +def test_task(): + run_imagenet_classifier_test(InceptionNetV3.from_pretrained(), MODEL_ID) + + +def test_trace(): + run_imagenet_classifier_trace_test(InceptionNetV3.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/inception_v3_quantized/README.md 
b/qai_hub_models/models/inception_v3_quantized/README.md new file mode 100644 index 00000000..6198c751 --- /dev/null +++ b/qai_hub_models/models/inception_v3_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Inception-v3Quantized: Quantized Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/inception_v3_quantized) + +InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). + +This is based on the implementation of Inception-v3Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/inception_v3_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.inception_v3_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. 
def main(is_test: bool = False):
    """Run the shared ImageNet classifier demo with the quantizable Inception-v3 model.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo`.
    """
    model_cls = InceptionNetV3Quantizable
    imagenet_demo(model_cls, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export quantized Inception-v3 for on-device use. Six stages are performed:

    1. Instantiate the PyTorch model and convert it to a hub source model.
    2. Compile the model to an asset that can be run on device.
    3. Profile the compiled model on a real device.
    4. Run inference on device with sample inputs.
    5. Download the compiled asset to the local directory.
    6. Summarize the results from profiling and inference.

    Each of the last four stages can be skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `build/inception_v3_quantized` under the current
            working directory.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When AI Hub is not accessible, the result of `export_without_hub_access`.
        Otherwise a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
    """
    model_name = "inception_v3_quantized"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)

    # Without hub access there is nothing to submit; hand off to the offline path.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "Inception-v3Quantized",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    summarize_profile = not (skip_summary or skip_profiling)
    summarize_inference = not (skip_summary or skip_inferencing)

    # 1. Initialize PyTorch model
    pretrained_kwargs = get_model_kwargs(Model, additional_model_kwargs)
    model = Model.from_pretrained(**pretrained_kwargs)
    spec_kwargs = get_input_spec_kwargs(model, additional_model_kwargs)
    input_spec = model.get_input_spec(**spec_kwargs)

    # Trace the model
    source_model = model.convert_to_hub_source_model(
        target_runtime, output_path, input_spec
    )
    # Calibration data is only requested for non-TFLite targets.
    quant_calibration_data = (
        None
        if target_runtime == TargetRuntime.TFLITE
        else model.get_calibration_data(target_runtime, input_spec)
    )

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        calibration_data=quant_calibration_data,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        if target_runtime == TargetRuntime.QNN:
            # NOTE(review): the value computed here is discarded, because the
            # transpose below is fed `sample_inputs` again. The call is kept
            # so behavior is unchanged; confirm whether the QNN inputs were
            # meant to be transposed and submitted instead.
            get_qnn_inputs(compile_job, sample_inputs)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        download_target = output_path / f"{model_name}.tflite"
        compile_job.get_target_model().download(str(download_target))

    # 6. Summarize the results from profiling and inference
    if summarize_profile:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if summarize_inference:
        # sample_inputs was bound in stage 4, which ran whenever this branch does.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/inception_v3_quantized/info.yaml b/qai_hub_models/models/inception_v3_quantized/info.yaml new file mode 100644 index 00000000..b2580041 --- /dev/null +++ b/qai_hub_models/models/inception_v3_quantized/info.yaml @@ -0,0 +1,40 @@ +name: Inception-v3Quantized +# id must match with the model dir name in qai_hub_models +id: inception_v3_quantized +status: public +headline: Quantized Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). 
+use_case: Image Classification +tags: + - backbone + - quantized +research_paper: http://arxiv.org/abs/1512.00567 +research_paper_title: Rethinking the Inception Architecture for Computer Vision +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py +technical_details: + Number of parameters: 27.2M + Model size: 104 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - inception_v3 + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/inception_v3_quantized/model.py b/qai_hub_models/models/inception_v3_quantized/model.py new file mode 100644 index 00000000..b1e10928 --- /dev/null +++ b/qai_hub_models/models/inception_v3_quantized/model.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.inception_v3.model import InceptionNetV3 +from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 3 +DEFAULT_ENCODINGS = "inception_v3_quantized_encodings.json" + + +class InceptionNetV3Quantizable(AIMETQuantizableMixin, InceptionNetV3): + """InceptionNetV3 with post train quantization support. 
+ + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + InceptionNetV3.__init__(self, sim_model.model) + AIMETQuantizableMixin.__init__( + self, sim_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "InceptionNetV3": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + model = InceptionNetV3.from_pretrained() + input_shape = model.get_input_spec()["image_tensor"][0] + + equalize_model(model, input_shape) + sim = QuantizationSimModel( + model.net, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_per_channel_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/inception_v3_quantized/perf.yaml b/qai_hub_models/models/inception_v3_quantized/perf.yaml new file mode 100644 index 00000000..5d7edfbb --- /dev/null +++ b/qai_hub_models/models/inception_v3_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + 
- Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Inception-v3Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1772.0 + throughput: 564.3340857787811 + estimated_peak_memory_range: + min: 28672 + max: 2456712 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 394 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 394 + job_id: jep2r9jxg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:37:27.025184Z' diff --git a/qai_hub_models/models/inception_v3_quantized/test.py b/qai_hub_models/models/inception_v3_quantized/test.py new file mode 100644 index 00000000..0b1f5dfb --- /dev/null +++ b/qai_hub_models/models/inception_v3_quantized/test.py @@ -0,0 +1,36 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.inception_v3_quantized.demo import main as demo_main +from qai_hub_models.models.inception_v3_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + InceptionNetV3Quantizable, +) + + +def test_task(): + run_imagenet_classifier_test( + InceptionNetV3Quantizable.from_pretrained(), + MODEL_ID, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + asset_version=MODEL_ASSET_VERSION, + ) + + +def test_trace(): + run_imagenet_classifier_trace_test( + 
InceptionNetV3Quantizable.from_pretrained(), + diff_tol=0.01, + rtol=0.02, + atol=0.2, + is_quantized=True, + ) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/lama_dilated/README.md b/qai_hub_models/models/lama_dilated/README.md new file mode 100644 index 00000000..28f6ea2e --- /dev/null +++ b/qai_hub_models/models/lama_dilated/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [LaMa-Dilated: High resolution image in-painting on-device](https://aihub.qualcomm.com/models/lama_dilated) + +LaMa-Dilated is a machine learning model that allows to erase and in-paint part of given input image. + +This is based on the implementation of LaMa-Dilated found +[here](https://github.com/advimman/lama). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/lama_dilated). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[lama_dilated]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.lama_dilated.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. 
This can be run as follows: + +```bash +python -m qai_hub_models.models.lama_dilated.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of LaMa-Dilated can be found + [here](https://github.com/advimman/lama/blob/main/LICENSE). + + +## References +* [Resolution-robust Large Mask Inpainting with Fourier Convolutions](https://arxiv.org/abs/2109.07161) +* [Source Model Implementation](https://github.com/advimman/lama) diff --git a/qai_hub_models/models/lama_dilated/__init__.py b/qai_hub_models/models/lama_dilated/__init__.py new file mode 100644 index 00000000..414dc9e0 --- /dev/null +++ b/qai_hub_models/models/lama_dilated/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.repaint.app import ( # noqa: F401 + RepaintMaskApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import LamaDilated as Model # noqa: F401 diff --git a/qai_hub_models/models/lama_dilated/demo.py b/qai_hub_models/models/lama_dilated/demo.py new file mode 100644 index 00000000..190a43ac --- /dev/null +++ b/qai_hub_models/models/lama_dilated/demo.py @@ -0,0 +1,22 @@ +from qai_hub_models.models._shared.repaint.demo import repaint_demo +from qai_hub_models.models.lama_dilated.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + LamaDilated, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_input_image.png" +) +MASK_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_input_mask.png" +) + + +def main(is_test: bool = False): + repaint_demo(LamaDilated, IMAGE_ADDRESS, MASK_ADDRESS, is_test) + + +if __name__ == "__main__": + 
main() diff --git a/qai_hub_models/models/lama_dilated/export.py b/qai_hub_models/models/lama_dilated/export.py new file mode 100644 index 00000000..ace152ad --- /dev/null +++ b/qai_hub_models/models/lama_dilated/export.py @@ -0,0 +1,193 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.lama_dilated import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. 
+ + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "lama_dilated" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "lama_dilated", + "LaMa-Dilated", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image,mask" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image,mask", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/lama_dilated/info.yaml b/qai_hub_models/models/lama_dilated/info.yaml new file mode 100644 index 00000000..3964e67d --- /dev/null +++ b/qai_hub_models/models/lama_dilated/info.yaml @@ -0,0 +1,30 @@ +name: LaMa-Dilated +# id must match with the model dir name in qai_hub_models +id: lama_dilated +status: public +headline: High resolution image in-painting on-device. +domain: Computer Vision +description: LaMa-Dilated is a machine learning model that allows to erase and in-paint part of given input image. 
+use_case: Image Editing +tags: + - backbone +research_paper: https://arxiv.org/abs/2109.07161 +research_paper_title: Resolution-robust Large Mask Inpainting with Fourier Convolutions +license: https://github.com/advimman/lama/blob/main/LICENSE +source_repo: https://github.com/advimman/lama +technical_details: + Number of parameters: 45.6M + Model size: 370MB + Model checkpoint: Dilated CelebAHQ + Input resolution: 512x512 +applicable_scenarios: + - Image editing +related_models: + - 'aotgan' +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: [] diff --git a/qai_hub_models/models/lama_dilated/model.py b/qai_hub_models/models/lama_dilated/model.py new file mode 100644 index 00000000..d98d1be7 --- /dev/null +++ b/qai_hub_models/models/lama_dilated/model.py @@ -0,0 +1,125 @@ +from __future__ import annotations + +import torch +from omegaconf import OmegaConf + +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + SourceAsRoot, + load_json, + load_torch, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +LAMA_SOURCE_REPOSITORY = "https://github.com/advimman/lama" +LAMA_SOURCE_REPO_COMMIT = "7dee0e4a3cf5f73f86a820674bf471454f52b74f" +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_WEIGHTS = "lama-dilated_celeba-hq" +MODEL_ASSET_VERSION = 1 + + +class LamaDilated(BaseModel): + """Exportable LamaDilated inpainting algorithm by Samsung Research.""" + + def __init__( + self, + lama_dilated_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = lama_dilated_model + + @staticmethod + def from_pretrained(weights_name: str = DEFAULT_WEIGHTS) -> LamaDilated: + """Load LamaDilated from a weights file created by the source LaMa repository.""" + + # Load PyTorch model from disk + lama_dilated_model = _load_lama_dilated_source_model_from_weights(weights_name) + + return 
LamaDilated(lama_dilated_model) + + def forward(self, image: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """ + Run LamaDilated on `image` and `mask`, and produce an image with mask area inpainted. + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + mask: Pixel values pre-processed to have mask values either 0. or 1. + Range: float[0, 1] and only values of 0. or 1. + 1-channel binary image. + + Returns: + inpainted_image: Pixel values + Range: float[0, 1] + 3-channel Color Space: RGB + """ + + masked_img = image * (1 - mask) + + if self.model.concat_mask: + masked_img = torch.cat([masked_img, mask], dim=1) + + predicted_image = self.model.generator(masked_img) + inpainted = mask * predicted_image + (1 - mask) * image + return inpainted + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 512, + width: int = 512, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + return { + "image": ((batch_size, num_channels, height, width), "float32"), + "mask": ((batch_size, 1, height, width), "float32"), + } + + +def _get_weightsfile_from_name(weights_name: str): + """Convert from names of weights files to the url for the weights file""" + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, f"checkpoints/{weights_name}.ckpt" + ) + + +def _get_config_url(): + """Get the url for the config file""" + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "checkpoints/training_config.json" + ) + + +def _load_lama_dilated_source_model_from_weights(weights_name: str) -> torch.nn.Module: + # Load LamaDilated model from the source repository using the given weights. 
+ weights_url = _get_weightsfile_from_name(weights_name) + config_url = _get_config_url() + + with SourceAsRoot( + LAMA_SOURCE_REPOSITORY, LAMA_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + ): + # Import module + from saicinpainting.training.trainers.default import ( + DefaultInpaintingTrainingModule, + ) + + # Pass config as needed to create the module for tracing. + config = load_json(config_url) + config = OmegaConf.create(config) + kwargs = dict(config.training_model) + kwargs.pop("kind") + kwargs["use_ddp"] = True + state = load_torch(weights_url) + lama_dilated_model = DefaultInpaintingTrainingModule(config, **kwargs) + lama_dilated_model.load_state_dict(state["state_dict"], strict=False) + lama_dilated_model.on_load_checkpoint(state) + lama_dilated_model.freeze() + return lama_dilated_model diff --git a/qai_hub_models/models/lama_dilated/perf.yaml b/qai_hub_models/models/lama_dilated/perf.yaml new file mode 100644 index 00000000..f951db3e --- /dev/null +++ b/qai_hub_models/models/lama_dilated/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: LaMa-Dilated + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 88596.0 + throughput: 11.287191295318072 + estimated_peak_memory_range: + min: 3289088 + max: 139215624 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 346 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 346 + job_id: jqpyojvr5 + job_status: 
Passed + torchscript_onnx_qnn: + inference_time: 84076.0 + throughput: 11.894000666064038 + estimated_peak_memory_range: + min: 4313088 + max: 34733320 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: j2p0m2e2g + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:27:42.653097Z' diff --git a/qai_hub_models/models/lama_dilated/requirements.txt b/qai_hub_models/models/lama_dilated/requirements.txt new file mode 100644 index 00000000..a21b654f --- /dev/null +++ b/qai_hub_models/models/lama_dilated/requirements.txt @@ -0,0 +1,10 @@ +matplotlib +pandas +albumentations==0.5.2 +pytorch-lightning==1.6.0 +webdataset +easydict==1.10 +kornia==0.5.0 +hydra-core==1.3.0 +omegaconf==2.3.0 +scikit-learn==1.3.0 diff --git a/qai_hub_models/models/lama_dilated/test.py b/qai_hub_models/models/lama_dilated/test.py new file mode 100644 index 00000000..b7e78c16 --- /dev/null +++ b/qai_hub_models/models/lama_dilated/test.py @@ -0,0 +1,61 @@ +import numpy as np + +from qai_hub_models.models._shared.repaint.app import RepaintMaskApp +from qai_hub_models.models.lama_dilated.demo import IMAGE_ADDRESS, MASK_ADDRESS +from qai_hub_models.models.lama_dilated.demo import main as demo_main +from qai_hub_models.models.lama_dilated.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + LamaDilated, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_output.png" +) + + +@skip_clone_repo_check +def test_task(): + app = RepaintMaskApp(LamaDilated.from_pretrained()) + + img = load_image(IMAGE_ADDRESS) + mask_image = 
load_image(MASK_ADDRESS) + + out_img = app.paint_mask_on_image(img, mask_image) + expected_out = load_image(OUTPUT_ADDRESS) + assert_most_close( + np.asarray(out_img[0], dtype=np.float32), + np.asarray(expected_out, dtype=np.float32), + 0.005, + rtol=0.02, + atol=1.5, + ) + + +@skip_clone_repo_check +def test_trace(): + net = LamaDilated.from_pretrained() + input_spec = net.get_input_spec() + trace = net.convert_to_torchscript(input_spec) + + img = load_image(IMAGE_ADDRESS) + mask_image = load_image(MASK_ADDRESS) + app = RepaintMaskApp(trace) + + out_imgs = app.paint_mask_on_image(img, mask_image) + expected_out = load_image(OUTPUT_ADDRESS) + assert_most_close( + np.asarray(out_imgs[0], dtype=np.float32), + np.asarray(expected_out, dtype=np.float32), + 0.005, + rtol=0.02, + atol=1.5, + ) + + +@skip_clone_repo_check +def test_demo(): + # Run demo and verify it does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/litehrnet/README.md b/qai_hub_models/models/litehrnet/README.md new file mode 100644 index 00000000..fe566187 --- /dev/null +++ b/qai_hub_models/models/litehrnet/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [LiteHRNet: Human pose estimation](https://aihub.qualcomm.com/models/litehrnet) + +LiteHRNet is a machine learning model that detects human pose and returns a location and confidence for each of 17 joints. + +This is based on the implementation of LiteHRNet found +[here](https://github.com/HRNet/Lite-HRNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/litehrnet). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. 
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[litehrnet]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.litehrnet.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.litehrnet.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of LiteHRNet can be found + [here](https://github.com/HRNet/Lite-HRNet/blob/hrnet/LICENSE). 
+ + +## References +* [Lite-HRNet: A Lightweight High-Resolution Network](https://arxiv.org/abs/2104.06403) +* [Source Model Implementation](https://github.com/HRNet/Lite-HRNet) diff --git a/qai_hub_models/models/litehrnet/__init__.py b/qai_hub_models/models/litehrnet/__init__.py new file mode 100644 index 00000000..f2acce51 --- /dev/null +++ b/qai_hub_models/models/litehrnet/__init__.py @@ -0,0 +1,3 @@ +from .app import LiteHRNetApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import LiteHRNet as Model # noqa: F401 diff --git a/qai_hub_models/models/litehrnet/app.py b/qai_hub_models/models/litehrnet/app.py new file mode 100644 index 00000000..7cfe0323 --- /dev/null +++ b/qai_hub_models/models/litehrnet/app.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from typing import Any, Callable, List, Tuple + +import numpy as np +import torch +from mmpose.codecs.utils import refine_keypoints +from PIL.Image import Image, fromarray + +from qai_hub_models.utils.draw import draw_points +from qai_hub_models.utils.image_processing import app_to_net_image_inputs + + +class LiteHRNetApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with LiteHRNet. + + The app uses 1 model: + * LiteHRNet + + For a given image input, the app will: + * pre-process the image + * Run LiteHRNet inference + * Convert the output into a list of keypoint coordinates + """ + + def __init__( + self, + model: Callable[ + [torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + ], + inferencer: Any, + ): + self.inferencer = inferencer + self.model = model + + def predict(self, *args, **kwargs): + # See predict_pose_keypoints. + return self.predict_pose_keypoints(*args, **kwargs) + + def predict_pose_keypoints( + self, + pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], + raw_output=False, + ) -> np.ndarray | List[Image]: + """ + Predicts pose keypoints for a person in the image. 
+ + Parameters: + pixel_values_or_image + PIL image(s) + or + numpy array (N H W C x uint8) or (H W C x uint8) -- both RGB channel layout + or + pyTorch tensor (N C H W x fp32, value range is [0, 1]), RGB channel layout + + raw_output: bool + See "returns" doc section for details. + + Returns: + If raw_output is true, returns: + keypoints: np.ndarray, shape [B, N, 2] + Numpy array of keypoints within the images Each keypoint is an (x, y) pair of coordinates within the image. + + Otherwise, returns: + predicted_images: List[PIL.Image] + Images with keypoints drawn. + """ + # Preprocess image to get data required for post processing + NHWC_int_numpy_frames, _ = app_to_net_image_inputs(pixel_values_or_image) + inputs = self.inferencer.preprocess(NHWC_int_numpy_frames, batch_size=1) + proc_inputs, _ = list(inputs)[0] + proc_inputs_ = proc_inputs["inputs"][0] + + # run inference + input = proc_inputs_.to(torch.float32) + predictions, _, heatmaps = self.model(input) + + # get the bounding box center from the preprocessing + # In older versions of the MM modules the center is directly a member + # of gt_instances and does not need to be computed. 
+ bbox = proc_inputs["data_samples"][0].gt_instances.bboxes[0] + center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2] + + scale = proc_inputs["data_samples"][0].gt_instances.bbox_scales[0] + + # perform refinement + keypoints = refine_keypoints( + predictions.unsqueeze(0).detach().numpy(), heatmaps.detach().numpy() + ) + scale_factor = np.array([4.0, 4.0]) + keypoints = keypoints * scale_factor + input_size = proc_inputs["data_samples"][0].metainfo["input_size"] + keypoints = keypoints / input_size * scale + center - 0.5 * scale + keypoints = np.round(keypoints).astype(np.int32) + + if raw_output: + return keypoints + + predicted_images = [] + for i, img in enumerate(NHWC_int_numpy_frames): + draw_points(img, keypoints[i], color=(255, 0, 0), size=2) + predicted_images.append(fromarray(img)) + return predicted_images diff --git a/qai_hub_models/models/litehrnet/demo.py b/qai_hub_models/models/litehrnet/demo.py new file mode 100644 index 00000000..9157a780 --- /dev/null +++ b/qai_hub_models/models/litehrnet/demo.py @@ -0,0 +1,52 @@ +from qai_hub_models.models.litehrnet.app import LiteHRNetApp +from qai_hub_models.models.litehrnet.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + LiteHRNet, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + model_from_cli_args, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +IA_HELP_MSG = "More inferencer architectures for litehrnet can be found at https://github.com/open-mmlab/mmpose/tree/main/configs/body_2d_keypoint/topdown_heatmap/coco" +IMAGE_LOCAL_PATH = "litehrnet_demo.png" +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, IMAGE_LOCAL_PATH +) + + +# Run LiteHRNet end-to-end on a sample image. +# The demo will display a image with the predicted keypoints. 
def main(is_test: bool = False):
    """
    Run the LiteHRNet demo end-to-end on a sample image.

    Parses the CLI arguments (an empty argument list when `is_test` is set),
    builds the model(s) selected by those arguments, runs keypoint prediction
    on the requested image and, outside of test mode, displays or saves the
    image with the predicted keypoints drawn on it.
    """
    # Demo parameters
    cli = get_on_device_demo_parser(
        get_model_cli_parser(LiteHRNet), add_output_dir=True
    )
    cli.add_argument(
        "--image",
        type=str,
        default=IMAGE_ADDRESS,
        help="image file path or URL",
    )
    argv = [] if is_test else None
    args = cli.parse_args(argv)

    # The locally constructed model supplies the inferencer used for pre-processing;
    # the demo model (possibly on-device) is the one actually invoked for inference.
    litehrnet_model = model_from_cli_args(LiteHRNet, args)
    hub_model = demo_model_from_cli_args(LiteHRNet, args)
    validate_on_device_demo_args(args, LiteHRNet.get_model_id())

    # Load image & model
    source_image = load_image(args.image)
    print("Model Loaded")

    app = LiteHRNetApp(hub_model, litehrnet_model.inferencer)
    # With raw_output left at its default, the app returns annotated images.
    annotated_image = app.predict_pose_keypoints(source_image)[0]
    if is_test:
        return
    display_or_save_image(
        annotated_image, args.output_dir, "litehrnet_demo_output.png"
    )


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `build/litehrnet` under the current working directory.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is not accessible, whatever `export_without_hub_access`
        returns (annotated as a list of strings). Otherwise a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

    Raises:
        AssertionError: If a profile or inference job that is being summarized
            did not finish successfully.
    """
    model_name = "litehrnet"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "litehrnet",
            "LiteHRNet",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=sample_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    #
    # The blocking wait() is deliberately kept out of an `assert` statement:
    # with `python -O` asserts are stripped entirely, which would skip the wait
    # and attempt the download before the job has finished. AssertionError is
    # still the exception raised, so existing callers see the same failure type.
    if not skip_summary and not skip_profiling:
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    """CLI entry point: parse export arguments and run `export_model` with them."""
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, supports_qnn=False)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
class LiteHRNet(BaseModel):
    """Exportable LiteHRNet pose joint detector, end-to-end."""

    def __init__(self, inferencer) -> None:
        """
        Wrap an MMPose inferencer.

        Keeps a handle on the inferencer (the app uses it for pre-processing),
        on the underlying pose network and its data pre-processor, and caches
        the heatmap dimensions and keypoint count read from the model head.
        """
        super().__init__()

        self.inferencer = inferencer
        pose_network = self.inferencer.inferencer.model
        self.model = pose_network
        self.pre_processor = pose_network.data_preprocessor
        # NOTE(review): forward() uses self.H as the row stride of the flattened
        # heatmap, i.e. as its width (48 for the 64x48 heatmap described there).
        # MMPose codecs appear to store heatmap_size as (w, h), so the H/W names
        # look swapped relative to their contents -- confirm before relying on them.
        self.H, self.W = pose_network.head.decoder.heatmap_size
        self.K = pose_network.head.out_channels
        self.B = 1

    @classmethod
    def from_pretrained(cls, inferencer_arch=DEFAULT_INFERENCER_ARCH) -> LiteHRNet:
        """LiteHRNet comes from the MMPose library, so we load using an internal config
        rather than a public weights file"""
        cpu = torch.device(type="cpu")
        return cls(MMPoseInferencer(inferencer_arch, device=cpu))

    def forward(
        self, image: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Run LiteHRNet on `image` and decode one keypoint per joint from the heatmaps.

        Parameters:
            image: Single un-batched image tensor, channels first. The channel order
                   is reversed and the result is normalized with the pre-processor's
                   mean and std before the network is run.
                   NOTE(review): previously documented as float[0, 1] RGB -- confirm
                   against what the app's pre-processing actually provides.

        Returns:
            keypoints: 17x2 array of coordinate pairs (in x,y format), expressed in
                       heatmap coordinates (the app rescales them to the image)
            scores: 17-element array holding the peak heatmap value of each joint
            heatmaps: 17 heatmaps of 64x48. These hold the raw confidence values of
                      the locations of each joint in the image. The keypoints and
                      scores are derived from this
        """
        # Preprocess: reverse the channel order, normalize, add a batch dimension
        flipped = image[[2, 1, 0], ...]
        normalized = (flipped - self.pre_processor.mean) / self.pre_processor.std
        batched = normalized.unsqueeze(0)

        # Model prediction
        raw_heatmaps = self.model._forward(batched)

        # Convert from heatmap to keypoints and scores
        # heatmap is 1 x 17 x 64 x 48, BxKxHxW
        heatmaps = raw_heatmaps.squeeze()
        per_joint = heatmaps.flatten(1)
        peak_index = per_joint.argmax(dim=1)

        # recover (x, y) of each peak from its flat index
        row_stride = self.H
        y_locs = (peak_index // row_stride).type(torch.float32)
        x_locs = (peak_index % row_stride).type(torch.float32)

        # get the max scores and corresponding keypoints
        scores, _ = per_joint.max(dim=1)
        keypoints = torch.stack((x_locs, y_locs), dim=-1)

        return keypoints, scores, heatmaps

    def get_input_spec(
        self,
        num_channels: int = 3,
        height: int = 256,
        width: int = 192,
    ) -> InputSpec:
        # Get the input specification ordered (name -> (shape, type)) pairs for this model.
        #
        # This can be used with the qai_hub python API to declare
        # the model input specification upon submitting a profile job.
        image_shape = (num_channels, height, width)
        return {"image": (image_shape, "float32")}
# Reference keypoints (x, y) for the demo image, one row per joint.
EXPECTED_KEYPOINTS = np.array(
    [
        [
            [70, 34],
            [77, 32],
            [72, 30],
            [91, 37],
            [72, 32],
            [109, 67],
            [67, 67],
            [130, 104],
            [63, 104],
            [112, 125],
            [40, 102],
            [105, 144],
            [77, 144],
            [119, 202],
            [81, 190],
            [142, 251],
            [88, 230],
        ]
    ]
)


def _test_impl(app: LiteHRNetApp):
    """Run `app` on the demo image and compare raw keypoints with the reference."""
    demo_image = load_image(IMAGE_ADDRESS)
    observed = app.predict_pose_keypoints(demo_image, True)

    expected_f32 = np.asarray(EXPECTED_KEYPOINTS, dtype=np.float32)
    observed_f32 = np.asarray(observed, dtype=np.float32)
    np.testing.assert_allclose(expected_f32, observed_f32, rtol=0.02, atol=1.5)


def test_task():
    """Check the eager PyTorch model."""
    model = LiteHRNet.from_pretrained()
    _test_impl(LiteHRNetApp(model, model.inferencer))


def test_trace():
    """Check the TorchScript-converted model; pre-processing still uses the eager model's inferencer."""
    model = LiteHRNet.from_pretrained()
    traced = model.convert_to_torchscript()
    _test_impl(LiteHRNetApp(traced, model.inferencer))


def test_demo():
    """Smoke-test the demo entry point in test mode."""
    demo_main(is_test=True)
Detect faces and locate facial features in real-time video and image streams](https://aihub.qualcomm.com/models/mediapipe_face) + +Designed for sub-millisecond processing, this model predicts bounding boxes and pose skeletons (left eye, right eye, nose tip, mouth, left eye tragion, and right eye tragion) of faces in an image. + +This is based on the implementation of MediaPipe-Face-Detection found +[here](https://github.com/zmurez/MediaPipePyTorch/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mediapipe_face). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[mediapipe_face]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mediapipe_face.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mediapipe_face.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MediaPipe-Face-Detection can be found + [here](https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE). 
class MediaPipeFaceApp(MediaPipeApp):
    """
    Light-weight "app code" required to perform end to end inference with
    MediaPipe's face landmark detector.

    The app uses 2 models:
        * MediaPipeFaceDetector
        * MediaPipeFaceLandmark

    See the class comment for the parent class for details.
    """

    def __init__(
        self,
        model: MediaPipeFace,
        min_detector_face_box_score: float = 0.75,
        nms_iou_threshold: float = 0.3,
        min_landmark_score: float = 0.5,
    ):
        """
        Construct a mediapipe face application.

        Inputs:
            model: MediaPipeFace model
                Face detection & landmark model container.

            See parent initializer for further parameter documentation.
        """
        detector = model.face_detector
        detector_anchors = model.face_detector.anchors
        landmark_detector = model.face_landmark_detector

        # The last two entries of each "image" input shape are forwarded to the
        # parent as that network's input dimensions.
        detector_input_dims = detector.get_input_spec()["image"][0][-2:]
        landmark_input_dims = landmark_detector.get_input_spec()["image"][0][-2:]

        # Arguments are positional and must stay in the parent's expected order.
        super().__init__(
            detector,
            detector_anchors,
            landmark_detector,
            detector_input_dims,
            landmark_input_dims,
            RIGHT_EYE_KEYPOINT_INDEX,
            LEFT_EYE_KEYPOINT_INDEX,
            ROTATION_VECTOR_OFFSET_RADS,
            DETECT_DXY,
            DETECT_DSCALE,
            min_detector_face_box_score,
            DETECT_SCORE_SLIPPING_THRESHOLD,
            nms_iou_threshold,
            min_landmark_score,
            FACE_LANDMARK_CONNECTIONS,
        )
def main(is_test: bool = False):
    """
    Run MediaPipe face landmark detection on an image or a camera stream.

    When `--image` is given (or in test mode, where the bundled sample image is
    used), a single image is processed and, outside of test mode, displayed or
    saved. Otherwise frames are captured from the camera selected by `--camera`
    and the annotated frames are shown live.

    Parameters:
        is_test: If set, CLI arguments are ignored, the sample image is used
            and nothing is displayed or saved.
    """
    # Demo parameters
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--image",
        type=str,
        default=None,
        help="image file path or URL",
    )
    parser.add_argument(
        "--camera",
        type=int,
        default=0,
        help="Camera Input ID",
    )
    parser.add_argument(
        "--score-threshold",
        type=float,
        default=0.75,
        help="Score threshold for NonMaximumSuppression",
    )
    parser.add_argument(
        "--iou-threshold",
        type=float,
        default=0.3,
        help="Intersection over Union (IoU) threshold for NonMaximumSuppression",
    )
    add_output_dir_arg(parser)

    args = parser.parse_args([] if is_test else None)
    if is_test:
        args.image = INPUT_IMAGE_ADDRESS

    # Printed only after argument parsing so that `--help` and argument errors
    # are not preceded by these notices.
    print(
        "Note: This demo is running through torch, and not meant to be real-time without dedicated ML hardware."
    )
    print("Use Ctrl+C in your terminal to exit.")

    # Load app
    app = MediaPipeFaceApp(
        MediaPipeFace.from_pretrained(),
        args.score_threshold,
        args.iou_threshold,
    )
    print("Model and App Loaded")

    if args.image:
        image = load_image(args.image).convert("RGB")
        pred_image = app.predict_landmarks_from_image(image)
        out_image = Image.fromarray(pred_image[0], "RGB")
        if not is_test:
            display_or_save_image(out_image, args.output_dir)
    else:

        def frame_processor(frame: np.ndarray) -> np.ndarray:
            return app.predict_landmarks_from_image(frame)[0]  # type: ignore

        capture_and_display_processed_frames(
            frame_processor, "QAIHM Mediapipe Face Demo", args.camera
        )


if __name__ == "__main__":
    main()
ALL_COMPONENTS = ["MediaPipeFaceDetector", "MediaPipeFaceLandmarkDetector"]


def export_model(
    device: str = "Samsung Galaxy S23",
    components: Optional[List[str]] = None,
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Mapping[
    str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]]
] | List[str]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        components: List of sub-components of the model that will be exported.
            Each component is compiled and profiled separately.
            Defaults to ALL_COMPONENTS if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `build/mediapipe_face` under the current working directory.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference jobs.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained`

    Returns:
        When Qualcomm AI Hub is not accessible, whatever `export_without_hub_access`
        returns (annotated as a list of strings). Otherwise a Mapping from
        component_name to a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

    Raises:
        ValueError: If `components` names something that is not in ALL_COMPONENTS.
        AssertionError: If a profile or inference job that is being summarized
            did not finish successfully.
    """
    model_name = "mediapipe_face"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Keep the caller's original value: it is forwarded untouched when hub is unavailable.
    component_arg = components
    components = components or ALL_COMPONENTS
    for component in components:
        if component not in ALL_COMPONENTS:
            raise ValueError(f"Invalid component {component}.")
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "mediapipe_face",
            "MediaPipe-Face-Detection",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
            component_arg,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    components_dict = {}
    if "MediaPipeFaceDetector" in components:
        components_dict["MediaPipeFaceDetector"] = model.face_detector
    if "MediaPipeFaceLandmarkDetector" in components:
        components_dict["MediaPipeFaceLandmarkDetector"] = model.face_landmark_detector

    compile_jobs = {}
    for component_name, component in components_dict.items():
        # Trace the model
        input_spec = component.get_input_spec()
        source_model = torch.jit.trace(component, make_torch_inputs(input_spec))

        # 2. Compile the models to an on-device asset
        model_compile_options = component.get_hub_compile_options(
            target_runtime, compile_options + " --force_channel_last_input image"
        )
        print(f"Optimizing model {component_name} to run on-device.")
        compile_jobs[component_name] = hub.submit_compile_job(
            model=source_model,
            input_specs=input_spec,
            device=hub.Device(device),
            name=f"{component_name}",
            options=model_compile_options,
        )

    # 3. Profile the model assets on real devices
    profile_jobs = {}
    if not skip_profiling:
        for component_name in components:
            print(f"Profiling model {component_name} on a hosted device.")
            profile_jobs[component_name] = hub.submit_profile_job(
                model=compile_jobs[component_name].get_target_model(),
                device=hub.Device(device),
                name=f"{component_name}",
                options=profile_options,
            )

    # 4. Run inference on-device with sample inputs
    inference_jobs = {}
    # Inputs submitted to the device, remembered so that the torch reference in
    # step 6 is computed from exactly the same data.
    submitted_inputs = {}
    if not skip_inferencing:
        for component_name in components:
            print(
                f"Running inference for {component_name} on a hosted device with example inputs."
            )
            sample_inputs = components_dict[component_name].sample_inputs()
            submitted_inputs[component_name] = sample_inputs
            # Convert inputs from channel first to channel last
            hub_inputs = transpose_channel_first_to_last(
                "image", sample_inputs, target_runtime
            )
            inference_jobs[component_name] = hub.submit_inference_job(
                model=compile_jobs[component_name].get_target_model(),
                inputs=hub_inputs,
                device=hub.Device(device),
                name=f"{component_name}",
                options=profile_options,
            )

    # 5. Download the model assets to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        for component_name, compile_job in compile_jobs.items():
            target_model = compile_job.get_target_model()
            target_model.download(
                str(output_path / f"{model_name}_{component_name}.tflite")
            )

    # 6. Summarize the results from profiling and inference
    #
    # The blocking wait() is deliberately kept out of an `assert` statement:
    # with `python -O` asserts are stripped entirely, which would skip the wait
    # and attempt the download before the job has finished. AssertionError is
    # still the exception raised, so existing callers see the same failure type.
    if not skip_summary and not skip_profiling:
        for component_name in components:
            profile_job = profile_jobs[component_name]
            if not profile_job.wait().success:
                raise AssertionError(
                    f"Profile job for {component_name} did not succeed."
                )
            profile_data = profile_job.download_profile()
            print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        for component_name in components:
            inference_job = inference_jobs[component_name]
            sample_inputs = submitted_inputs[component_name]
            torch_out = torch_inference(components_dict[component_name], sample_inputs)
            if not inference_job.wait().success:
                raise AssertionError(
                    f"Inference job for {component_name} did not succeed."
                )
            inference_result = inference_job.download_output_data()
            print_inference_metrics(inference_job, inference_result, torch_out)

    return {
        component_name: (
            compile_jobs[component_name],
            profile_jobs.get(component_name, None),
            inference_jobs.get(component_name, None),
        )
        for component_name in components
    }


def main():
    """CLI entry point: parse export arguments and run `export_model` with them."""
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, components=ALL_COMPONENTS)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
a/qai_hub_models/models/mediapipe_face/info.yaml b/qai_hub_models/models/mediapipe_face/info.yaml new file mode 100644 index 00000000..47a2db2f --- /dev/null +++ b/qai_hub_models/models/mediapipe_face/info.yaml @@ -0,0 +1,37 @@ +name: MediaPipe-Face-Detection +# id must match with the model dir name in qai_hub_models +id: mediapipe_face +status: public +headline: Detect faces and locate facial features in real-time video and image streams. +domain: Computer Vision +description: Designed for sub-millisecond processing, this model predicts + bounding boxes and pose skeletons (left eye, right eye, nose tip, mouth, left + ear tragion, and right ear tragion) of faces in an image. +use_case: Object Detection +tags: + - real-time +research_paper: https://arxiv.org/abs/1907.05047 +research_paper_title: "BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs" +license: https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE +source_repo: https://github.com/zmurez/MediaPipePyTorch/ +technical_details: + Encoder Number of parameters: 136K + Decoder Number of parameters: 605K + Model size: 1.3 MB + Input resolution: 256x256 +applicable_scenarios: + - Accessibility + - Augmented Reality + - Gaming +related_models: + - 'mediapipe_hand' + - 'mediapipe_pose' + - 'mediapipe_selfie' +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: [] diff --git a/qai_hub_models/models/mediapipe_face/model.py b/qai_hub_models/models/mediapipe_face/model.py new file mode 100644 index 00000000..bc72c214 --- /dev/null +++ b/qai_hub_models/models/mediapipe_face/model.py @@ -0,0 +1,278 @@ +from __future__ import annotations + +from typing import Callable, Tuple + +import torch + +from qai_hub_models.models._shared.mediapipe.utils import MediaPipePyTorchAsRoot +from qai_hub_models.utils.base_model import BaseModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = 
__name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + +# Vertex indices can be found in +# https://github.com/google/mediapipe/blob/0.8.1/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png +# Found in https://github.com/google/mediapipe/blob/v0.10.3/mediapipe/python/solutions/face_mesh.py +FACE_LANDMARK_CONNECTIONS = [ + # Lips. + (61, 146), + (146, 91), + (91, 181), + (181, 84), + (84, 17), + (17, 314), + (314, 405), + (405, 321), + (321, 375), + (375, 291), + (61, 185), + (185, 40), + (40, 39), + (39, 37), + (37, 0), + (0, 267), + (267, 269), + (269, 270), + (270, 409), + (409, 291), + (78, 95), + (95, 88), + (88, 178), + (178, 87), + (87, 14), + (14, 317), + (317, 402), + (402, 318), + (318, 324), + (324, 308), + (78, 191), + (191, 80), + (80, 81), + (81, 82), + (82, 13), + (13, 312), + (312, 311), + (311, 310), + (310, 415), + (415, 308), + # Left eye. + (263, 249), + (249, 390), + (390, 373), + (373, 374), + (374, 380), + (380, 381), + (381, 382), + (382, 362), + (263, 466), + (466, 388), + (388, 387), + (387, 386), + (386, 385), + (385, 384), + (384, 398), + (398, 362), + # Left eyebrow. + (276, 283), + (283, 282), + (282, 295), + (295, 285), + (300, 293), + (293, 334), + (334, 296), + (296, 336), + # Right eye. + (33, 7), + (7, 163), + (163, 144), + (144, 145), + (145, 153), + (153, 154), + (154, 155), + (155, 133), + (33, 246), + (246, 161), + (161, 160), + (160, 159), + (159, 158), + (158, 157), + (157, 173), + (173, 133), + # Right eyebrow. + (46, 53), + (53, 52), + (52, 65), + (65, 55), + (70, 63), + (63, 105), + (105, 66), + (66, 107), + # Face oval. 
+ (10, 338), + (338, 297), + (297, 332), + (332, 284), + (284, 251), + (251, 389), + (389, 356), + (356, 454), + (454, 323), + (323, 361), + (361, 288), + (288, 397), + (397, 365), + (365, 379), + (379, 378), + (378, 400), + (400, 377), + (377, 152), + (152, 148), + (148, 176), + (176, 149), + (149, 150), + (150, 136), + (136, 172), + (172, 58), + (58, 132), + (132, 93), + (93, 234), + (234, 127), + (127, 162), + (162, 21), + (21, 54), + (54, 103), + (103, 67), + (67, 109), + (109, 10), +] + + +# Face detector model parameters. +BATCH_SIZE = 1 +DETECT_SCORE_SLIPPING_THRESHOLD = 100 # Clip output scores to this maximum value. +DETECT_DXY, DETECT_DSCALE = ( + 0, + 1.1, +) # Modifiers applied to face detector output bounding box to encapsulate the entire face. +LEFT_EYE_KEYPOINT_INDEX = 0 # The face detector outputs several keypoints. This is the keypoint index for the left eye. +RIGHT_EYE_KEYPOINT_INDEX = 1 # The face detector outputs several keypoints. This is the keypoint index for the right eye. +ROTATION_VECTOR_OFFSET_RADS = ( + 0 # Offset required when computing rotation of the detected face. +) + + +class MediaPipeFace(CollectionModel): + def __init__( + self, + face_detector: FaceDetector, + face_landmark_detector: FaceLandmarkDetector, + ) -> None: + """ + Construct a mediapipe face model. + + Inputs: + face_detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]] + Face detection model. Input is an image, output is + [bounding boxes & keypoints, box & kp scores] + + face_landmark_detector + Face landmark detector model. Input is an image cropped to the face. The face must be upright + and un-tilted in the frame. 
Returns [landmark_scores, landmarks] + """ + super().__init__() + self.face_detector = face_detector + self.face_landmark_detector = face_landmark_detector + + @classmethod + def from_pretrained( + cls, + detector_weights: str = "blazefaceback.pth", + detector_anchors: str = "anchors_face_back.npy", + landmark_detector_weights: str = "blazeface_landmark.pth", + ) -> MediaPipeFace: + """ + Load mediapipe models from the source repository. + Returns tuple[ + .blazeface.BlazeFace, + BlazeFace Anchors, + .blazeface_landmark.BlazeFaceLandmark, + ] + """ + with MediaPipePyTorchAsRoot(): + from blazeface import BlazeFace + from blazeface_landmark import BlazeFaceLandmark + + face_detector = BlazeFace(back_model=True) + face_detector.load_weights(detector_weights) + face_detector.load_anchors(detector_anchors) + face_regressor = BlazeFaceLandmark() + face_regressor.load_weights(landmark_detector_weights) + + return cls( + FaceDetector(face_detector, face_detector.anchors), + FaceLandmarkDetector(face_regressor), + ) + + +class FaceDetector(BaseModel): + def __init__( + self, + detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], + anchors: torch.Tensor, + ): + super().__init__() + self.detector = detector + self.anchors = anchors + + def forward(self, image: torch.Tensor): + return self.detector(image) + + @classmethod + def from_pretrained( + cls, + detector_weights: str = "blazefaceback.pth", + detector_anchors: str = "anchors_face_back.npy", + ): + with MediaPipePyTorchAsRoot(): + from blazeface import BlazeFace + + face_detector = BlazeFace(back_model=True) + face_detector.load_weights(detector_weights) + face_detector.load_anchors(detector_anchors) + return cls(face_detector, face_detector.anchors) + + def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + """ + Returns the input specification (name -> (shape, type) of the face detector. + This can be used to submit profiling job on Qualcomm AI Hub. 
+ """ + return {"image": ((batch_size, 3, 256, 256), "float32")} + + +class FaceLandmarkDetector(BaseModel): + def __init__( + self, + detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], + ): + super().__init__() + self.detector = detector + + def forward(self, image: torch.Tensor): + return self.detector(image) + + @classmethod + def from_pretrained(cls, landmark_detector_weights: str = "blazeface_landmark.pth"): + with MediaPipePyTorchAsRoot(): + from blazeface_landmark import BlazeFaceLandmark + + face_regressor = BlazeFaceLandmark() + face_regressor.load_weights(landmark_detector_weights) + return cls(face_regressor) + + def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + """ + Returns the input specification (name -> (shape, type) of the face landmark detector. + This can be used to submit profiling job on Qualcomm AI Hub. + """ + return {"image": ((batch_size, 3, 192, 192), "float32")} diff --git a/qai_hub_models/models/mediapipe_face/perf.yaml b/qai_hub_models/models/mediapipe_face/perf.yaml new file mode 100644 index 00000000..3df1ee28 --- /dev/null +++ b/qai_hub_models/models/mediapipe_face/perf.yaml @@ -0,0 +1,107 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: MediaPipeFaceDetector + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 536.0 + throughput: 1865.6716417910447 + estimated_peak_memory_range: + min: 12288 + max: 1539856 + primary_compute_unit: NPU + 
precision: fp16 + layer_info: + layers_on_npu: 111 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 111 + job_id: jqp4ydjqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 592.0 + throughput: 1689.1891891891892 + estimated_peak_memory_range: + min: 802816 + max: 57565728 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 147 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 147 + job_id: jo5m06vyg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:08:54.792595Z' +- name: MediaPipeFaceLandmarkDetector + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 209.0 + throughput: 4784.688995215311 + estimated_peak_memory_range: + min: 24576 + max: 1806472 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 100 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 100 + job_id: j0pxl6ejp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 286.0 + throughput: 3496.5034965034965 + estimated_peak_memory_range: + min: 462848 + max: 8766648 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: jegnzmxvg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:12:20.881454Z' diff --git a/qai_hub_models/models/mediapipe_face/requirements.txt b/qai_hub_models/models/mediapipe_face/requirements.txt new file mode 100644 index 00000000..9c11ddeb --- /dev/null +++ b/qai_hub_models/models/mediapipe_face/requirements.txt @@ -0,0 +1,2 @@ +opencv-python +requests diff --git a/qai_hub_models/models/mediapipe_face/test.py b/qai_hub_models/models/mediapipe_face/test.py new file mode 
100644 index 00000000..72e8a977 --- /dev/null +++ b/qai_hub_models/models/mediapipe_face/test.py @@ -0,0 +1,37 @@ +import numpy as np + +from qai_hub_models.models.mediapipe_face.app import MediaPipeFaceApp +from qai_hub_models.models.mediapipe_face.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.mediapipe_face.demo import main as demo_main +from qai_hub_models.models.mediapipe_face.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + MediaPipeFace, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "face_output.png" +) + + +# Because we have not made a modification to the pytorch source network, +# no numerical tests are included for the model; only for the app. +@skip_clone_repo_check +def test_face_app(): + input = load_image( + INPUT_IMAGE_ADDRESS, + ) + expected_output = load_image( + OUTPUT_IMAGE_ADDRESS, + ).convert("RGB") + app = MediaPipeFaceApp(MediaPipeFace.from_pretrained()) + assert np.allclose( + app.predict_landmarks_from_image(input)[0], np.asarray(expected_output) + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/mediapipe_hand/README.md b/qai_hub_models/models/mediapipe_hand/README.md new file mode 100644 index 00000000..7ad1da5e --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [MediaPipe-Hand-Detection: Real-time hand detection optimized for mobile and edge](https://aihub.qualcomm.com/models/mediapipe_hand) + +The MediaPipe Hand Landmark Detector is a machine learning pipeline that predicts bounding boxes and pose skeletons of hands in an image. 
+ +This is based on the implementation of MediaPipe-Hand-Detection found +[here](https://github.com/zmurez/MediaPipePyTorch/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mediapipe_hand). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[mediapipe_hand]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mediapipe_hand.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mediapipe_hand.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MediaPipe-Hand-Detection can be found + [here](https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE). 
+ + +## References +* [MediaPipe Hands: On-device Real-time Hand Tracking](https://arxiv.org/abs/2006.10214) +* [Source Model Implementation](https://github.com/zmurez/MediaPipePyTorch/) diff --git a/qai_hub_models/models/mediapipe_hand/__init__.py b/qai_hub_models/models/mediapipe_hand/__init__.py new file mode 100644 index 00000000..1d5c430b --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/__init__.py @@ -0,0 +1,3 @@ +from .app import MediaPipeHandApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import MediaPipeHand as Model # noqa: F401 diff --git a/qai_hub_models/models/mediapipe_hand/app.py b/qai_hub_models/models/mediapipe_hand/app.py new file mode 100644 index 00000000..86d6f1fa --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/app.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +from typing import List, Tuple + +import cv2 +import numpy as np +import torch +from PIL.Image import Image + +from qai_hub_models.models._shared.mediapipe.app import MediaPipeApp +from qai_hub_models.models.mediapipe_hand.model import ( + DETECT_DSCALE, + DETECT_DXY, + DETECT_SCORE_SLIPPING_THRESHOLD, + HAND_LANDMARK_CONNECTIONS, + MIDDLE_FINDER_KEYPOINT_INDEX, + ROTATION_VECTOR_OFFSET_RADS, + WRIST_CENTER_KEYPOINT_INDEX, + MediaPipeHand, +) +from qai_hub_models.utils.bounding_box_processing import ( + compute_box_affine_crop_resize_matrix, +) +from qai_hub_models.utils.draw import draw_connections, draw_points +from qai_hub_models.utils.image_processing import ( + apply_affine_to_coordinates, + apply_batched_affines_to_frame, + numpy_image_to_torch, +) + + +class MediaPipeHandApp(MediaPipeApp): + """ + This class consists of light-weight "app code" that is required to perform end to end inference with MediaPipe's hand landmark detector. + + The app uses 2 models: + * MediaPipeHandDetector + * MediaPipeHandLandmark + + See the class comment for the parent class for details. 
+ """ + + def __init__( + self, + model: MediaPipeHand, + min_detector_hand_box_score: float = 0.95, + nms_iou_threshold: float = 0.3, + min_landmark_score: float = 0.5, + ): + """ + Construct a mediapipe hand application. + + Inputs: + model: MediaPipeHand model + Hand detection & landmark model container. + + See parent initializer for further parameter documentation. + """ + super().__init__( + model.hand_detector, + model.hand_detector.anchors, + model.hand_landmark_detector, + model.hand_detector.get_input_spec()["image"][0][-2:], + model.hand_landmark_detector.get_input_spec()["image"][0][-2:], + WRIST_CENTER_KEYPOINT_INDEX, + MIDDLE_FINDER_KEYPOINT_INDEX, + ROTATION_VECTOR_OFFSET_RADS, + DETECT_DXY, + DETECT_DSCALE, + min_detector_hand_box_score, + DETECT_SCORE_SLIPPING_THRESHOLD, + nms_iou_threshold, + min_landmark_score, + HAND_LANDMARK_CONNECTIONS, + ) + + def predict_landmarks_from_image( + self, + pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], + raw_output: bool = False, + ) -> Tuple[ + List[torch.Tensor | None], + List[torch.Tensor | None], + List[torch.Tensor | None], + List[List[bool] | None], + ] | List[np.ndarray]: + """ + From the provided image or tensor, predict the bounding boxes & classes of the hand detected within. + + Parameters: + See parent function documentation. + + Returns: + See parent function documentation for generic return values. + + If raw_output is false, returns an additional output: + + batched_is_right_hand: List[List[bool] | None]] + Whether each landmark represents a right (True) or left (False) hand. + Organized like the following: + [ + # Batch 0 (for Input Image 0) + [ + True (for Selected Landmark 1) + False (Selected Landmark 2) + ... + ] + # Batch 1 (for Input Image 1) + None # (this image has no detected palm) + ... 
+ ] + """ + return super().predict_landmarks_from_image(pixel_values_or_image, raw_output) # type: ignore + + def _draw_predictions( + self, + NHWC_int_numpy_frames: List[np.ndarray], + batched_selected_boxes: List[torch.Tensor | None], + batched_selected_keypoints: List[torch.Tensor | None], + batched_roi_4corners: List[torch.Tensor | None], + batched_selected_landmarks: List[torch.Tensor | None], + batched_is_right_hand: List[List[bool] | None], + ): + """ + Override of mediapipe::app.py::MediaPipeApp::draw_outputs + Also draws whether the detection is a right or left hand. + + Additional inputs: + batched_is_right_hand: List[List[bool] | None] + True if the detection is a right hand, false if it's a left hand. None if no hand detected. + """ + for batch_idx in range(len(NHWC_int_numpy_frames)): + image = NHWC_int_numpy_frames[batch_idx] + ld = batched_selected_landmarks[batch_idx] + box = batched_selected_boxes[batch_idx] + kp = batched_selected_keypoints[batch_idx] + roi_4corners = batched_roi_4corners[batch_idx] + irh = batched_is_right_hand[batch_idx] + + if box is not None and kp is not None and roi_4corners is not None: + self._draw_box_and_roi(image, box, kp, roi_4corners) + if ld is not None and irh is not None: + self._draw_landmarks(image, ld, irh) + + def _draw_landmarks( + self, + NHWC_int_numpy_frame: np.ndarray, + landmarks: torch.Tensor, + is_right_hand: List[bool], + ): + """ + Override of mediapipe::app.py::MediaPipeApp::draw_landmarks + Also draws whether the detection is a right or left hand. 
+ """ + for ldm, irh in zip(landmarks, is_right_hand): + # Draw landmark points + draw_points(NHWC_int_numpy_frame, ldm[:, :2], (0, 255, 0)) + # Draw connections between landmark points + if self.landmark_connections: + draw_connections( + NHWC_int_numpy_frame, + ldm[:, :2], + self.landmark_connections, + (255 if irh else 0, 0, 0 if irh else 255), + 2, + ) + + def _run_landmark_detector( + self, + NHWC_int_numpy_frames: List[np.ndarray], + batched_roi_4corners: List[torch.Tensor | None], + ) -> Tuple[List[torch.Tensor | None], List[List[bool] | None]]: + """ + Override of mediapipe::app.py::MediaPipeApp::run_landmark_detector + Additionally returns whether the detection is a right or left hand. + """ + + # selected landmarks for the ROI (if any) + # List[torch.Tensor(shape=[Num Selected Landmarks, K, 3])], + # where K == number of landmark keypoints, 3 == (x, y, p) + # + # A list element will be None if there is no ROI. + batched_selected_landmarks: List[torch.Tensor | None] = [] + + # whether the selected landmarks for the ROI (if applicable) are for a left or right hand + # + # A list element will be None if there is no ROI. + batched_is_right_hand: List[List[bool] | None] = [] + + # For each input image... + for batch_idx, roi_4corners in enumerate(batched_roi_4corners): + if roi_4corners is None: + continue + affines = compute_box_affine_crop_resize_matrix( + roi_4corners[:, :3], self.landmark_input_dims + ) + + # Create input images by applying the affine transforms. + keypoint_net_inputs = numpy_image_to_torch( + apply_batched_affines_to_frame( + NHWC_int_numpy_frames[batch_idx], affines, self.landmark_input_dims + ) + ) + + # Compute hand landmarks. + ld_scores, lr, landmarks = self.landmark_detector( # type: ignore + keypoint_net_inputs + ) + + # Convert [0-1] ranged values of landmarks to integer pixel space. 
+ landmarks[:, :, 0] *= self.landmark_input_dims[0] + landmarks[:, :, 1] *= self.landmark_input_dims[1] + + # 1 landmark is predicted for each ROI of each input image. + # For each region of interest & associated predicted landmarks... + all_landmarks = [] + all_lr = [] + for ld_batch_idx in range(landmarks.shape[0]): + # Exclude landmarks that don't meet the appropriate score threshold. + if ld_scores[ld_batch_idx] >= self.min_detector_box_score: + # Apply the inverse of affine transform used above to the landmark coordinates. + # This will convert the coordinates to their locations in the original input image. + inverted_affine = torch.from_numpy( + cv2.invertAffineTransform(affines[ld_batch_idx]) + ).float() + landmarks[ld_batch_idx][:, :2] = apply_affine_to_coordinates( + landmarks[ld_batch_idx][:, :2], inverted_affine + ) + + # Add the predicted landmarks to our list. + all_landmarks.append(landmarks[ld_batch_idx]) + all_lr.append(torch.round(lr[ld_batch_idx]).item() == 1) + + # Add this batch of landmarks to the output list. + batched_selected_landmarks.append( + torch.stack(all_landmarks, dim=0) if all_landmarks else None + ) + batched_is_right_hand.append(all_lr) + else: + # Add None for these lists, since this batch has no predicted bounding boxes. 
+ batched_selected_landmarks.append(None) + batched_is_right_hand.append(None) + + return (batched_selected_landmarks, batched_is_right_hand) diff --git a/qai_hub_models/models/mediapipe_hand/demo.py b/qai_hub_models/models/mediapipe_hand/demo.py new file mode 100644 index 00000000..730ea277 --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/demo.py @@ -0,0 +1,85 @@ +import argparse + +import numpy as np +from PIL import Image + +from qai_hub_models.models.mediapipe_hand.app import MediaPipeHandApp +from qai_hub_models.models.mediapipe_hand.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + MediaPipeHand, +) +from qai_hub_models.utils.args import add_output_dir_arg +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.camera_capture import capture_and_display_processed_frames +from qai_hub_models.utils.display import display_or_save_image + +INPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "hand.jpeg" +) + + +# Run Mediapipe Hand landmark detection end-to-end on a sample image or camera stream. +# The demo will display output with the predicted landmarks & bounding boxes drawn. +def main(is_test: bool = False): + # Demo parameters + parser = argparse.ArgumentParser() + parser.add_argument( + "--image", + type=str, + required=False, + help="image file path or URL", + ) + parser.add_argument( + "--camera", + type=int, + default=0, + help="Camera Input ID", + ) + parser.add_argument( + "--score-threshold", + type=float, + default=0.95, + help="Score threshold for NonMaximumSuppression", + ) + parser.add_argument( + "--iou-threshold", + type=float, + default=0.3, + help="Intersection over Union (IoU) threshold for NonMaximumSuppression", + ) + add_output_dir_arg(parser) + + print( + "Note: This readme is running through torch, and not meant to be real-time without dedicated ML hardware." 
+ ) + print("Use Ctrl+C in your terminal to exit.") + + args = parser.parse_args([] if is_test else None) + if is_test: + args.image = INPUT_IMAGE_ADDRESS + + # Load app + app = MediaPipeHandApp( + MediaPipeHand.from_pretrained(), args.score_threshold, args.iou_threshold + ) + print("Model and App Loaded") + + if args.image: + image = load_image(args.image) + pred_image = app.predict_landmarks_from_image(image) + out_image = Image.fromarray(pred_image[0], "RGB") + if not is_test: + display_or_save_image(out_image, args.output_dir) + else: + + def frame_processor(frame: np.ndarray) -> np.ndarray: + return app.predict_landmarks_from_image(frame)[0] # type: ignore + + capture_and_display_processed_frames( + frame_processor, "QAIHM Mediapipe Hand Demo", args.camera + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mediapipe_hand/export.py b/qai_hub_models/models/mediapipe_hand/export.py new file mode 100644 index 00000000..e05b7bf9 --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/export.py @@ -0,0 +1,215 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Mapping, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.mediapipe_hand import Model +from qai_hub_models.utils.args import ( + export_parser, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + +ALL_COMPONENTS = ["MediaPipeHandDetector", "MediaPipeHandLandmarkDetector"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[ + str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. 
+ components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "mediapipe_hand" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + component_arg = components + components = components or ALL_COMPONENTS + for component in components: + if component not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "mediapipe_hand", + "MediaPipe-Hand-Detection", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict = {} + if "MediaPipeHandDetector" in components: + components_dict["MediaPipeHandDetector"] = model.hand_detector + if "MediaPipeHandLandmarkDetector" in components: + components_dict["MediaPipeHandLandmarkDetector"] = model.hand_landmark_detector + + compile_jobs = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {component_name} to run on-device.") + compile_jobs[component_name] = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{component_name}", + options=model_compile_options, + ) + + # 3. Profile the model assets on real devices + profile_jobs = {} + if not skip_profiling: + for component_name in components: + print(f"Profiling model {component_name} on a hosted device.") + profile_jobs[component_name] = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_jobs = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." 
+ ) + sample_inputs = components_dict[component_name].sample_inputs() + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_jobs[component_name] = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 5. Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model = compile_job.get_target_model() + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git 
a/qai_hub_models/models/mediapipe_hand/info.yaml b/qai_hub_models/models/mediapipe_hand/info.yaml new file mode 100644 index 00000000..5cbb7642 --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/info.yaml @@ -0,0 +1,35 @@ +name: MediaPipe-Hand-Detection +# id must match with the model dir name in qai_hub_models +id: mediapipe_hand +status: public +headline: Real-time hand detection optimized for mobile and edge. +domain: Computer Vision +description: The MediaPipe Hand Landmark Detector is a machine learning pipeline that predicts bounding boxes and pose skeletons of hands in an image. +use_case: Object Detection +tags: + - real-time +research_paper: https://arxiv.org/abs/2006.10214 +research_paper_title: "MediaPipe Hands: On-device Real-time Hand Tracking" +license: https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE +source_repo: https://github.com/zmurez/MediaPipePyTorch/ +technical_details: + Encoder Number of parameters: 1.76M + Decoder Number of parameters: 2.01M + Model size: 16 MB + Input resolution: 256x256 +applicable_scenarios: + - Gesture Control + - Virtual Reality + - Gaming +related_models: + - 'mediapipe_face' + - 'mediapipe_pose' + - 'mediapipe_selfie' +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: [] diff --git a/qai_hub_models/models/mediapipe_hand/model.py b/qai_hub_models/models/mediapipe_hand/model.py new file mode 100644 index 00000000..09e67ed1 --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/model.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +from typing import Callable, Tuple + +import numpy as np +import torch + +from qai_hub_models.models._shared.mediapipe.utils import MediaPipePyTorchAsRoot +from qai_hub_models.utils.base_model import BaseModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + +# 
# https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py
#        8   12  16  20
#        |   |   |   |
#        7   11  15  19
#    4   |   |   |   |
#    |   6   10  14  18
#    3   |   |   |   |
#    |   5---9---13--17
#    2    \         /
#     \    \       /
#      1    \     /
#       \    \   /
#        ------0-
# Edges between the landmark indices drawn in the diagram above.
# NOTE(review): the previous comment said the landmark model outputs 18 points,
# but the edges below reference indices 0 through 20 (21 points) -- confirm.
HAND_LANDMARK_CONNECTIONS = [
    (0, 1),
    (1, 2),
    (2, 3),
    (3, 4),
    (5, 6),
    (6, 7),
    (7, 8),
    (9, 10),
    (10, 11),
    (11, 12),
    (13, 14),
    (14, 15),
    (15, 16),
    (17, 18),
    (18, 19),
    (19, 20),
    (0, 5),
    (5, 9),
    (9, 13),
    (13, 17),
    (0, 17),
]

# Palm detector model parameters.
BATCH_SIZE = 1
DETECT_SCORE_SLIPPING_THRESHOLD = 100  # Clip output scores to this maximum value.
DETECT_DXY, DETECT_DSCALE = (
    0.5,
    2.5,
)  # Modifiers applied to palm detector output bounding box to encapsulate the entire hand.
WRIST_CENTER_KEYPOINT_INDEX = 0  # The palm detector outputs several keypoints. This is the keypoint index for the wrist center.
# NOTE(review): "FINDER" looks like a typo for "FINGER"; the name is kept
# because other modules may import it.
MIDDLE_FINDER_KEYPOINT_INDEX = 2  # The palm detector outputs several keypoints. This is the keypoint index for the bottom of the middle finger.
ROTATION_VECTOR_OFFSET_RADS = (
    np.pi / 2
)  # Offset required when computing rotation of the detected palm.


class MediaPipeHand(CollectionModel):
    """Container holding the hand detector and the hand landmark detector."""

    def __init__(
        self,
        hand_detector: HandDetector,
        hand_landmark_detector: HandLandmarkDetector,
    ) -> None:
        """
        Construct a mediapipe hand model.

        Inputs:
            hand_detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]
                Hand detection model. Input is an image, output is
                [bounding boxes & keypoints, box & keypoint scores]

            hand_landmark_detector
                Hand landmark detector model. Input is an image cropped to the hand. The hand must be upright
                and un-tilted in the frame. Returns [landmark_scores, prob_is_right_hand, landmarks]
        """
        super().__init__()
        self.hand_detector = hand_detector
        self.hand_landmark_detector = hand_landmark_detector

    @classmethod
    def from_pretrained(
        cls,
        detector_weights: str = "blazepalm.pth",
        detector_anchors: str = "anchors_palm.npy",
        landmark_detector_weights: str = "blazehand_landmark.pth",
    ) -> MediaPipeHand:
        """
        Load both sub-models from the MediaPipePyTorch source repository.

        Inputs:
            detector_weights: Weights file passed to BlazePalm.load_weights.
            detector_anchors: Anchors file passed to BlazePalm.load_anchors.
            landmark_detector_weights: Weights file passed to BlazeHandLandmark.load_weights.

        Returns:
            A MediaPipeHand wrapping a HandDetector and a HandLandmarkDetector.
        """
        with MediaPipePyTorchAsRoot():
            # These modules are imported while MediaPipePyTorchAsRoot is active.
            from blazehand_landmark import BlazeHandLandmark
            from blazepalm import BlazePalm

            palm_detector = BlazePalm()
            palm_detector.load_weights(detector_weights)
            palm_detector.load_anchors(detector_anchors)
            palm_detector.min_score_thresh = 0.75
            hand_regressor = BlazeHandLandmark()
            hand_regressor.load_weights(landmark_detector_weights)

            return cls(
                HandDetector(palm_detector, palm_detector.anchors),
                HandLandmarkDetector(hand_regressor),
            )


class HandDetector(BaseModel):
    """Wraps the palm detector network together with its anchors."""

    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
        anchors: torch.Tensor,
    ):
        super().__init__()
        self.detector = detector
        self.anchors = anchors

    def forward(self, image: torch.Tensor):
        """Run the wrapped detector on `image` and return its outputs unchanged."""
        return self.detector(image)

    @classmethod
    def from_pretrained(
        cls,
        detector_weights: str = "blazepalm.pth",
        detector_anchors: str = "anchors_palm.npy",
    ) -> HandDetector:
        """Load the palm detector weights and anchors and wrap them in a HandDetector."""
        with MediaPipePyTorchAsRoot():
            from blazepalm import BlazePalm

            # NOTE(review): MediaPipeHand.from_pretrained constructs BlazePalm()
            # with no arguments and sets min_score_thresh = 0.75; here
            # back_model=True is passed and no threshold is set -- confirm that
            # BlazePalm accepts this keyword and that the difference is intended.
            hand_detector = BlazePalm(back_model=True)
            hand_detector.load_weights(detector_weights)
            hand_detector.load_anchors(detector_anchors)
            return cls(hand_detector, hand_detector.anchors)

    def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type) of the hand detector.
        This can be used to submit profiling job on Qualcomm AI Hub.
        """
        return {"image": ((batch_size, 3, 256, 256), "float32")}


class HandLandmarkDetector(BaseModel):
    """Wraps the hand landmark regression network."""

    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
    ):
        super().__init__()
        self.detector = detector

    def forward(self, image: torch.Tensor):
        """Run the wrapped landmark detector on `image` and return its outputs unchanged."""
        return self.detector(image)

    @classmethod
    def from_pretrained(
        cls, landmark_detector_weights: str = "blazehand_landmark.pth"
    ) -> HandLandmarkDetector:
        """Load the landmark detector weights and wrap them in a HandLandmarkDetector."""
        with MediaPipePyTorchAsRoot():
            from blazehand_landmark import BlazeHandLandmark

            hand_regressor = BlazeHandLandmark()
            hand_regressor.load_weights(landmark_detector_weights)
            # The instance was previously constructed and discarded, so this
            # classmethod returned None; return it to the caller.
            return cls(hand_regressor)

    def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type) of the hand landmark detector.
        This can be used to submit profiling job on Qualcomm AI Hub.
        """
        return {"image": ((batch_size, 3, 256, 256), "float32")}
fp16 + layer_info: + layers_on_npu: 151 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 151 + job_id: jwgolne4g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 820.0 + throughput: 1219.5121951219512 + estimated_peak_memory_range: + min: 806912 + max: 6264240 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 196 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 196 + job_id: j7gjr2k7p + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:08:53.710000Z' +- name: MediaPipeHandLandmarkDetector + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1017.0 + throughput: 983.284169124877 + estimated_peak_memory_range: + min: 24576 + max: 2409872 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 158 + job_id: j1pvlrz75 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1088.0 + throughput: 919.1176470588235 + estimated_peak_memory_range: + min: 577536 + max: 53567440 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 209 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 209 + job_id: jlpe7w475 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:12:22.243551Z' diff --git a/qai_hub_models/models/mediapipe_hand/requirements.txt b/qai_hub_models/models/mediapipe_hand/requirements.txt new file mode 100644 index 00000000..9c11ddeb --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/requirements.txt @@ -0,0 +1,2 @@ +opencv-python +requests diff --git a/qai_hub_models/models/mediapipe_hand/test.py b/qai_hub_models/models/mediapipe_hand/test.py new file mode 100644 index 
OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store(
    MODEL_ID, MODEL_ASSET_VERSION, "hand_output.png"
)

# Because we have not made a modification to the pytorch source network,
# no numerical tests are included for the model; only for the app.


@skip_clone_repo_check
def test_hand_app():
    """Check that the app's first output for the demo image matches the stored reference image."""
    source_image = load_image(INPUT_IMAGE_ADDRESS)
    reference_image = load_image(OUTPUT_IMAGE_ADDRESS).convert("RGB")
    hand_app = MediaPipeHandApp(MediaPipeHand.from_pretrained())
    predictions = hand_app.predict_landmarks_from_image(source_image)
    assert np.allclose(predictions[0], np.asarray(reference_image))


@skip_clone_repo_check
def test_demo():
    """Run the demo entry point in test mode."""
    demo_main(is_test=True)
+ +This is based on the implementation of MediaPipe-Pose-Estimation found +[here](https://github.com/zmurez/MediaPipePyTorch/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mediapipe_pose). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[mediapipe_pose]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mediapipe_pose.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mediapipe_pose.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MediaPipe-Pose-Estimation can be found + [here](https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE). 
class MediaPipePoseApp(MediaPipeApp):
    """
    Light-weight "app code" needed to run MediaPipe's pose landmark detector end to end.

    Two models are used by the app:
        * MediaPipePoseDetector
        * MediaPipePoseLandmark

    The parent class comment describes the shared pipeline in detail.
    """

    def __init__(
        self,
        model: MediaPipePose,
        min_detector_pose_box_score: float = 0.75,
        nms_iou_threshold: float = 0.3,
        min_landmark_score: float = 0.5,
    ):
        """
        Construct a mediapipe pose application.

        Inputs:
            model: MediaPipePose model
                Pose detection & landmark model container.

        The remaining parameters are documented on the parent initializer.
        """

        def _landmark_detector_ignore_third_output(
            x: torch.Tensor,
        ) -> Tuple[torch.Tensor, torch.Tensor]:
            """
            Call the landmark detector and keep only its first two outputs;
            the last output ("mask") is not used by the demo application.
            """
            outputs = model.pose_landmark_detector(x)
            first, second, _ = outputs
            return first, second

        detector = model.pose_detector
        anchors = detector.anchors
        # Last two entries of each "image" input shape.
        detector_input_dims = detector.get_input_spec()["image"][0][-2:]
        landmark_input_dims = model.pose_landmark_detector.get_input_spec()["image"][
            0
        ][-2:]

        super().__init__(
            detector,
            anchors,
            _landmark_detector_ignore_third_output,
            detector_input_dims,
            landmark_input_dims,
            POSE_KEYPOINT_INDEX_START,
            POSE_KEYPOINT_INDEX_END,
            ROTATION_VECTOR_OFFSET_RADS,
            DETECT_DXY,
            DETECT_DSCALE,
            min_detector_pose_box_score,
            DETECT_SCORE_SLIPPING_THRESHOLD,
            nms_iou_threshold,
            min_landmark_score,
            POSE_LANDMARK_CONNECTIONS,
        )

    def _compute_object_roi(
        self,
        batched_selected_boxes: List[torch.Tensor | None],
        batched_selected_keypoints: List[torch.Tensor | None],
    ) -> List[torch.Tensor | None]:
        """
        Override of the parent ROI computation (see the parent for parameter docs).

        Instead of using the detector bounding box, the ROI is derived from the
        two keypoints that define the rotation vector.
        """
        rois = []
        pairs = zip(batched_selected_boxes, batched_selected_keypoints)
        for boxes, keypoints in pairs:
            if boxes is not None and keypoints is not None:
                start_idx = self.keypoint_rotation_vec_start_idx
                end_idx = self.keypoint_rotation_vec_end_idx

                # Rotation of the vector from the start keypoint to the end keypoint
                theta = compute_vector_rotation(
                    keypoints[:, start_idx, ...],
                    keypoints[:, end_idx, ...],
                    self.rotation_offset_rads,
                )

                # The start keypoint is used as the box center
                xc = keypoints[..., start_idx, 0]
                yc = keypoints[..., start_idx, 1]
                x1 = keypoints[..., end_idx, 0]
                y1 = keypoints[..., end_idx, 1]

                # Square box: side is twice the keypoint distance, scaled
                w = ((xc - x1) ** 2 + (yc - y1) ** 2).sqrt() * 2 * self.detect_box_scale
                h = w

                roi = compute_box_corners_with_rotation(xc, yc, w, h, theta)
            else:
                roi = None
            rois.append(roi)

        return rois
# The demo will display output with the predicted landmarks & bounding boxes drawn.
def main(is_test: bool = False):
    """
    Run the MediaPipe pose demo on a single image or on a camera stream.

    Inputs:
        is_test: When True, command-line arguments are ignored, the bundled
            sample image is used, and the result is not displayed or saved.
    """
    # Demo parameters
    parser = argparse.ArgumentParser()
    # NOTE(review): this help text ends with "must be multiples" without saying
    # of what -- confirm the intended constraint and complete the sentence.
    parser.add_argument(
        "--image",
        type=str,
        required=False,
        help="image file path or URL. Image spatial dimensions (x and y) must be multiples",
    )
    add_output_dir_arg(parser)
    parser.add_argument(
        "--camera",
        type=int,
        default=0,
        help="Camera Input ID",
    )
    parser.add_argument(
        "--score-threshold",
        type=float,
        default=0.75,
        help="Score threshold for NonMaximumSuppression",
    )
    parser.add_argument(
        "--iou-threshold",
        type=float,
        default=0.3,
        help="Intersection over Union (IoU) threshold for NonMaximumSuppression",
    )

    # In test mode parse an empty argv so the test runner's arguments are ignored.
    args = parser.parse_args([] if is_test else None)
    if is_test:
        args.image = INPUT_IMAGE_ADDRESS

    # The message previously said "readme"; this script is the demo.
    print(
        "Note: This demo is running through torch, and not meant to be real-time without dedicated ML hardware."
    )
    print("Use Ctrl+C in your terminal to exit.")

    # Load app
    app = MediaPipePoseApp(
        MediaPipePose.from_pretrained(), args.score_threshold, args.iou_threshold
    )
    print("Model and App Loaded")

    if args.image:
        # Single-image mode: predict once and show/save the first output.
        image = load_image(args.image).convert("RGB")
        pred_image = app.predict_landmarks_from_image(image)
        out = Image.fromarray(pred_image[0], "RGB")
        if not is_test:
            display_or_save_image(out, args.output_dir)
    else:
        # Camera mode: process frames from the selected camera.

        def frame_processor(frame: np.ndarray) -> np.ndarray:
            return app.predict_landmarks_from_image(frame)[0]  # type: ignore

        capture_and_display_processed_frames(
            frame_processor, "QAIHM Mediapipe Pose Demo", args.camera
        )


if __name__ == "__main__":
    main()
ALL_COMPONENTS = ["MediaPipePoseDetector", "MediaPipePoseLandmarkDetector"]


def export_model(
    device: str = "Samsung Galaxy S23",
    components: Optional[List[str]] = None,
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Mapping[
    str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]]
] | List[str]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        components: List of sub-components of the model that will be exported.
            Each component is compiled and profiled separately.
            Defaults to ALL_COMPONENTS if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/mediapipe_pose`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained`

    Returns:
        A Mapping from component_name to a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When Qualcomm AI Hub cannot be accessed, the result of
        `export_without_hub_access` is returned instead.

    Raises:
        ValueError: If an entry of `components` is not in ALL_COMPONENTS.
        AssertionError: If a profile or inference job does not succeed while summarizing.
    """
    model_name = "mediapipe_pose"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Keep the caller's original value (possibly None) for the no-hub-access path.
    component_arg = components
    components = components or ALL_COMPONENTS
    for component in components:
        if component not in ALL_COMPONENTS:
            raise ValueError(f"Invalid component {component}.")
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "mediapipe_pose",
            "MediaPipe-Pose-Estimation",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
            component_arg,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    components_dict = {}
    if "MediaPipePoseDetector" in components:
        components_dict["MediaPipePoseDetector"] = model.pose_detector
    if "MediaPipePoseLandmarkDetector" in components:
        components_dict["MediaPipePoseLandmarkDetector"] = model.pose_landmark_detector

    compile_jobs = {}
    for component_name, component in components_dict.items():
        # Trace the model
        input_spec = component.get_input_spec()
        source_model = torch.jit.trace(component, make_torch_inputs(input_spec))

        # 2. Compile the models to an on-device asset
        model_compile_options = component.get_hub_compile_options(
            target_runtime, compile_options + " --force_channel_last_input image"
        )
        print(f"Optimizing model {component_name} to run on-device.")
        compile_jobs[component_name] = hub.submit_compile_job(
            model=source_model,
            input_specs=input_spec,
            device=hub.Device(device),
            name=component_name,
            options=model_compile_options,
        )

    # 3. Profile the model assets on real devices
    profile_jobs = {}
    if not skip_profiling:
        for component_name in components:
            print(f"Profiling model {component_name} on a hosted device.")
            profile_jobs[component_name] = hub.submit_profile_job(
                model=compile_jobs[component_name].get_target_model(),
                device=hub.Device(device),
                name=component_name,
                options=profile_options,
            )

    # 4. Run inference on-device with sample inputs
    inference_jobs = {}
    if not skip_inferencing:
        for component_name in components:
            print(
                f"Running inference for {component_name} on a hosted device with example inputs."
            )
            sample_inputs = components_dict[component_name].sample_inputs()
            # Convert inputs from channel first to channel last
            hub_inputs = transpose_channel_first_to_last(
                "image", sample_inputs, target_runtime
            )
            # NOTE(review): inference jobs are submitted with profile_options;
            # confirm this reuse is intended.
            inference_jobs[component_name] = hub.submit_inference_job(
                model=compile_jobs[component_name].get_target_model(),
                inputs=hub_inputs,
                device=hub.Device(device),
                name=component_name,
                options=profile_options,
            )

    # 5. Download the model assets to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        for component_name, compile_job in compile_jobs.items():
            target_model = compile_job.get_target_model()
            # NOTE(review): the file suffix is always ".tflite" regardless of
            # dst_runtime -- confirm for non-TFLite targets.
            target_model.download(
                str(output_path / f"{model_name}_{component_name}.tflite")
            )

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        for component_name in components:
            profile_job = profile_jobs[component_name]
            # Call wait() outside the assert so the job is still awaited when
            # assertions are stripped (python -O).
            profile_status = profile_job.wait()
            assert (
                profile_status.success
            ), f"Profile job for {component_name} did not succeed."
            profile_data = profile_job.download_profile()
            print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        for component_name in components:
            inference_job = inference_jobs[component_name]
            sample_inputs = components_dict[component_name].sample_inputs()
            torch_out = torch_inference(components_dict[component_name], sample_inputs)
            # Same reasoning as above: keep the blocking wait out of the assert.
            inference_status = inference_job.wait()
            assert (
                inference_status.success
            ), f"Inference job for {component_name} did not succeed."
            inference_result = inference_job.download_output_data()
            print_inference_metrics(inference_job, inference_result, torch_out)

    return {
        component_name: (
            compile_jobs[component_name],
            profile_jobs.get(component_name, None),
            inference_jobs.get(component_name, None),
        )
        for component_name in components
    }


def main():
    """Parse the export command-line arguments and run export_model with them."""
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, components=ALL_COMPONENTS)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
MODEL_ID = __name__.split(".")[-2]
MODEL_ASSET_VERSION = 1

# Pairs of landmark indices that are joined by an edge.
POSE_LANDMARK_CONNECTIONS = [
    (0, 1),
    (1, 2),
    (2, 3),
    (3, 7),
    (0, 4),
    (4, 5),
    (5, 6),
    (6, 8),
    (9, 10),
    (11, 13),
    (13, 15),
    (15, 17),
    (17, 19),
    (19, 15),
    (15, 21),
    (12, 14),
    (14, 16),
    (16, 18),
    (18, 20),
    (20, 16),
    (16, 22),
    (11, 12),
    (12, 24),
    (24, 23),
    (23, 11),
]


# pose detector model parameters.
BATCH_SIZE = 1
DETECT_SCORE_SLIPPING_THRESHOLD = 100  # Clip output scores to this maximum value.
DETECT_DXY, DETECT_DSCALE = (
    0,
    1.5,
)  # Modifiers applied to pose detector output bounding box to encapsulate the entire pose.
POSE_KEYPOINT_INDEX_START = 2  # The pose detector outputs several keypoints. This is the keypoint index for the bottom.
POSE_KEYPOINT_INDEX_END = 3  # The pose detector outputs several keypoints. This is the keypoint index for the top.
ROTATION_VECTOR_OFFSET_RADS = (
    torch.pi / 2
)  # Offset required when computing rotation of the detected pose.


class MediaPipePose(CollectionModel):
    """Container holding the pose detector and the pose landmark detector."""

    def __init__(
        self,
        pose_detector: PoseDetector,
        pose_landmark_detector: PoseLandmarkDetector,
    ) -> None:
        """
        Construct a mediapipe pose model.

        Inputs:
            pose_detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]
                Pose detection model. Input is an image, output is
                [bounding boxes & keypoints, box & kp scores]

            pose_landmark_detector
                Pose landmark detector model. Input is an image cropped to the posing object. The pose must be upright
                and un-tilted in the frame. Returns [landmark_scores, landmarks, mask]

                Note that although the landmark detector returns 3 values,
                the third output (mask) is unused by this application.

        """
        super().__init__()
        self.pose_detector = pose_detector
        self.pose_landmark_detector = pose_landmark_detector

    @classmethod
    def from_pretrained(
        cls,
        detector_weights: str = "blazepose.pth",
        detector_anchors: str = "anchors_pose.npy",
        landmark_detector_weights: str = "blazepose_landmark.pth",
    ) -> MediaPipePose:
        """
        Load mediapipe models from the source repository.

        Inputs:
            detector_weights: Weights file passed to BlazePose.load_weights.
            detector_anchors: Anchors file passed to BlazePose.load_anchors.
            landmark_detector_weights: Weights file passed to BlazePoseLandmark.load_weights.

        Returns:
            A MediaPipePose wrapping a PoseDetector (BlazePose plus its anchors)
            and a PoseLandmarkDetector (BlazePoseLandmark).
        """
        with MediaPipePyTorchAsRoot():
            # These modules are imported while MediaPipePyTorchAsRoot is active.
            from blazepose import BlazePose
            from blazepose_landmark import BlazePoseLandmark

            pose_detector = BlazePose()
            pose_detector.load_weights(detector_weights)
            pose_detector.load_anchors(detector_anchors)
            pose_regressor = BlazePoseLandmark()
            pose_regressor.load_weights(landmark_detector_weights)

            return cls(
                PoseDetector(pose_detector, pose_detector.anchors),
                PoseLandmarkDetector(pose_regressor),
            )


class PoseDetector(BaseModel):
    """Wraps the pose detector network together with its anchors."""

    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
        anchors: torch.Tensor,
    ):
        super().__init__()
        self.detector = detector
        self.anchors = anchors

    def forward(self, image: torch.Tensor):
        """Run the wrapped detector on `image` and return its outputs unchanged."""
        return self.detector(image)

    @classmethod
    def from_pretrained(
        cls,
        detector_weights: str = "blazepose.pth",
        detector_anchors: str = "anchors_pose.npy",
    ) -> PoseDetector:
        """Load the pose detector weights and anchors and wrap them in a PoseDetector."""
        with MediaPipePyTorchAsRoot():
            from blazepose import BlazePose

            # NOTE(review): MediaPipePose.from_pretrained constructs BlazePose()
            # with no arguments; here back_model=True is passed -- confirm that
            # BlazePose accepts this keyword and that the difference is intended.
            pose_detector = BlazePose(back_model=True)
            pose_detector.load_weights(detector_weights)
            pose_detector.load_anchors(detector_anchors)
            return cls(pose_detector, pose_detector.anchors)

    def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type) of the pose detector.
        This can be used to submit profiling job on Qualcomm AI Hub.
        """
        return {"image": ((batch_size, 3, 128, 128), "float32")}


class PoseLandmarkDetector(BaseModel):
    """Wraps the pose landmark regression network."""

    def __init__(
        self,
        detector: Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]],
    ):
        super().__init__()
        self.detector = detector

    def forward(self, image: torch.Tensor):
        """Run the wrapped landmark detector on `image` and return its outputs unchanged."""
        return self.detector(image)

    @classmethod
    def from_pretrained(
        cls, landmark_detector_weights: str = "blazepose_landmark.pth"
    ) -> PoseLandmarkDetector:
        """Load the landmark detector weights and wrap them in a PoseLandmarkDetector."""
        with MediaPipePyTorchAsRoot():
            from blazepose_landmark import BlazePoseLandmark

            pose_regressor = BlazePoseLandmark()
            pose_regressor.load_weights(landmark_detector_weights)
            # The instance was previously constructed and discarded, so this
            # classmethod returned None; return it to the caller.
            return cls(pose_regressor)

    def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type) of the pose landmark detector.
        This can be used to submit profiling job on Qualcomm AI Hub.
        """
        return {"image": ((batch_size, 3, 256, 256), "float32")}
fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: j1p3z1wz5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 865.0 + throughput: 1156.0693641618498 + estimated_peak_memory_range: + min: 212992 + max: 66280848 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: j1pvlr9m5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:22:09.229999Z' +- name: MediaPipePoseLandmarkDetector + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1023.0 + throughput: 977.5171065493646 + estimated_peak_memory_range: + min: 12288 + max: 3253904 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 229 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 229 + job_id: jwgoln4dg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1101.0 + throughput: 908.2652134423251 + estimated_peak_memory_range: + min: 20480 + max: 149395360 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 305 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 305 + job_id: j7gjr2w8p + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:29:24.657545Z' diff --git a/qai_hub_models/models/mediapipe_pose/requirements.txt b/qai_hub_models/models/mediapipe_pose/requirements.txt new file mode 100644 index 00000000..9c11ddeb --- /dev/null +++ b/qai_hub_models/models/mediapipe_pose/requirements.txt @@ -0,0 +1,2 @@ +opencv-python +requests diff --git a/qai_hub_models/models/mediapipe_pose/test.py b/qai_hub_models/models/mediapipe_pose/test.py new file mode 100644 index 
OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store(
    MODEL_ID, MODEL_ASSET_VERSION, "pose_output.png"
)


# Because we have not made a modification to the pytorch source network,
# no numerical tests are included for the model; only for the app.


@skip_clone_repo_check
def test_pose_app():
    """The app's rendered output for the demo image matches the stored reference."""
    source_image = load_image(INPUT_IMAGE_ADDRESS)
    reference = load_image(OUTPUT_IMAGE_ADDRESS).convert("RGB")
    pose_app = MediaPipePoseApp(MediaPipePose.from_pretrained())

    # The first element of the prediction is compared against the reference image.
    rendered = pose_app.predict_landmarks_from_image(source_image)[0]
    assert np.allclose(rendered, np.asarray(reference))


@skip_clone_repo_check
def test_demo():
    """The CLI demo runs end to end in test mode."""
    demo_main(is_test=True)
+ +This is based on the implementation of MediaPipe-Selfie-Segmentation found +[here](https://github.com/google/mediapipe/tree/master/mediapipe/modules/selfie_segmentation). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mediapipe_selfie). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[mediapipe_selfie]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mediapipe_selfie.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mediapipe_selfie.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MediaPipe-Selfie-Segmentation can be found + [here](https://github.com/google/mediapipe/blob/master/LICENSE).
RESIZE_SHAPE = (256, 256)


class SelfieSegmentationApp:
    """
    Light-weight "app code" for end to end inference with the
    Selfie Segmentation model.

    For a given image input, the app will:
        * Pre-process the image (resize and normalize)
        * Run Selfie Segmentation model inference
        * Convert the raw output into a segmentation mask.
    """

    def __init__(self, model: Callable[[torch.Tensor], torch.Tensor]):
        # Any callable mapping an image tensor to the model output works here.
        self.model = model

    def predict(self, image: Image) -> np.ndarray:
        """
        Generate the segmentation mask for the provided image.

        Parameters:
            image: A PIL Image in RGB format.

        Returns:
            mask: Segmented mask as a (256, 256) np.array clipped to [0, 1].
        """
        resized = image.resize(RESIZE_SHAPE)
        prediction = self.model(preprocess_PIL_image(resized))

        # Only the first element of the model output is used.
        raw_mask = prediction[0].detach().numpy()
        return np.clip(raw_mask.reshape((256, 256)), 0, 1)
def mediapipe_selfie_demo(
    model_cls: Type[BaseModel],
    default_image: str | CachedWebModelAsset,
    is_test: bool = False,
):
    """
    Run the selfie segmentation app end-to-end on a sample image.

    Parameters:
        model_cls: Model class used to build the CLI parser and the model.
        default_image: Default value of the `--image` argument.
        is_test: When set, CLI arguments are ignored and nothing is
            displayed or saved.
    """
    # Demo parameters
    cli = get_model_cli_parser(model_cls)
    cli.add_argument(
        "--image",
        type=str,
        default=default_image,
        help="File path or URL to an input image to use for the demo.",
    )
    add_output_dir_arg(cli)
    argv = [] if is_test else None
    cli_args = cli.parse_args(argv)

    # Load image & model
    segmenter = model_from_cli_args(model_cls, cli_args)
    print("Model loaded from pre-trained weights.")
    source_image = load_image(cli_args.image, verbose=True, desc="sample input image")

    # Run app; the mask is scaled by 255 before conversion to a grayscale image.
    scaled_mask = SelfieSegmentationApp(segmenter).predict(source_image) * 255.0
    mask_image = fromarray(scaled_mask).convert("L")
    if is_test:
        return

    # Make sure the input image and mask are resized so the demo can visually
    # show the images in the same resolution.
    source_image = source_image.resize(mask_image.size)
    display_or_save_image(
        source_image,
        cli_args.output_dir,
        "mediapipe_selfie_image.png",
        "sample input image",
    )
    display_or_save_image(
        mask_image,
        cli_args.output_dir,
        "mediapipe_selfie_mask.png",
        "predicted mask",
    )


def main(is_test: bool = False):
    """Entry point: run the demo with the default model and sample image."""
    mediapipe_selfie_demo(SelfieSegmentation, IMAGE_ADDRESS, is_test)


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/mediapipe_selfie`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is accessible, a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        Otherwise, whatever `export_without_hub_access` returns
        (annotated as List[str]).
    """
    model_name = "mediapipe_selfie"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access no job can be submitted; delegate to the offline helper.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "mediapipe_selfie",
            "MediaPipe-Selfie-Segmentation",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The user-supplied compile options are extended so that the "image" input
    # and the "output_0" output use a channel-last layout on device.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        # NOTE(review): the inference job reuses `profile_options`; confirm this
        # is intended rather than a dedicated inference option string.
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", independent of
        # `dst_runtime`; confirm for non-TFLite runtimes.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Reference outputs computed locally with PyTorch on the same sample inputs.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export CLI arguments and run `export_model` with them."""
    # All warnings are silenced for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
MODEL_ID = __name__.split(".")[-2]
MODEL_ASSET_VERSION = 1
MEDIAPIPE_SELFIE_CKPT_MAP = dict(
    square=CachedWebModelAsset.from_asset_store(
        MODEL_ID, MODEL_ASSET_VERSION, "weights/selfie_segmentation.tflite"
    ),
    landscape=CachedWebModelAsset.from_asset_store(
        MODEL_ID, MODEL_ASSET_VERSION, "weights/selfie_segmentation_landscape.tflite"
    ),
)
DEFAULT_IMAGE_TYPE = "square"


class DepthwiseConv2d(nn.Module):
    """Depthwise convolution: one filter group per input channel."""

    def __init__(self, in_channels, kernel_size=3, stride=2, padding=1):
        super(DepthwiseConv2d, self).__init__()
        # groups == in_channels makes every channel convolve independently.
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
        )

    def forward(self, x):
        x = self.depthwise(x)
        return x


class SelfieSegmentation(BaseModel):
    """Reconstruct the selfie segmentation graph for square as well as landscape image."""

    def __init__(self, image_type: str = DEFAULT_IMAGE_TYPE):
        """
        Parameters:
            image_type: str (choices: square or landscape)
                Instance of two model variations can be created:
                * One for square images (H=W)
                * One for rectangle images (landscape format)

        Raises:
            ValueError: If `image_type` is neither "square" nor "landscape".

        Notes:
            The only difference between the two architectures is the average
            pooling: it is applied for every image type except "landscape"
            (see `self.allow_avg`).

            The order in which the convolution layers are created must not
            change: `from_pretrained` maps TFLite tensors onto the entries of
            `state_dict()` purely by position.
        """
        if image_type not in ["square", "landscape"]:
            raise ValueError(f"Unsupported image type {image_type}")

        super(SelfieSegmentation, self).__init__()
        self.image_type = image_type
        self.allow_avg = image_type != "landscape"
        self.relu = nn.ReLU(inplace=True)
        self.hardswish = nn.Hardswish()
        self.sigmoid = nn.Sigmoid()

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, 1)
        self.depthwise1 = DepthwiseConv2d(16, 3, 2, 1)
        if self.allow_avg:
            self.avgpool1 = nn.AvgPool2d(kernel_size=64, stride=64, padding=0)
        self.conv3 = nn.Conv2d(16, 8, kernel_size=1, stride=1, padding=0)
        self.conv4 = nn.Conv2d(8, 16, kernel_size=1, stride=1, padding=0)

        self.conv5 = nn.Conv2d(16, 16, 1)
        self.conv6 = nn.Conv2d(16, 72, 1)
        self.depthwise2 = DepthwiseConv2d(72, 3, 2, 1)

        self.conv7 = nn.Conv2d(72, 24, 1)
        self.conv8 = nn.Conv2d(24, 88, 1)
        self.depthwise3 = DepthwiseConv2d(88, 3, 1, 1)
        self.conv9 = nn.Conv2d(88, 24, 1)

        self.conv10 = nn.Conv2d(24, 96, kernel_size=1, stride=1, padding=0)
        self.depthwise4 = DepthwiseConv2d(96, 5, 2, 2)

        if self.allow_avg:
            self.avgpool2 = nn.AvgPool2d(kernel_size=16, stride=16, padding=0)
        self.conv11 = nn.Conv2d(96, 24, kernel_size=1, stride=1, padding=0)
        self.conv12 = nn.Conv2d(24, 96, kernel_size=1, stride=1, padding=0)
        self.conv13 = nn.Conv2d(96, 32, 1)

        self.conv14 = nn.Conv2d(32, 128, kernel_size=1, stride=1, padding=0)
        self.depthwise5 = DepthwiseConv2d(128, 5, 1, 2)
        if self.allow_avg:
            self.avgpool3 = nn.AvgPool2d(kernel_size=16, stride=16, padding=0)
        self.conv15 = nn.Conv2d(128, 32, kernel_size=1, stride=1, padding=0)
        self.conv16 = nn.Conv2d(32, 128, kernel_size=1, stride=1, padding=0)

        self.conv17 = nn.Conv2d(128, 32, 1)

        self.conv18 = nn.Conv2d(32, 128, kernel_size=1, stride=1, padding=0)
        self.depthwise6 = DepthwiseConv2d(128, 5, 1, 2)
        self.avgpool4 = nn.AvgPool2d(kernel_size=16, stride=16, padding=0)
        self.conv19 = nn.Conv2d(128, 32, kernel_size=1, stride=1, padding=0)
        self.conv20 = nn.Conv2d(32, 128, kernel_size=1, stride=1, padding=0)

        self.conv21 = nn.Conv2d(128, 32, 1)

        self.conv22 = nn.Conv2d(32, 96, kernel_size=1, stride=1, padding=0)
        self.depthwise7 = DepthwiseConv2d(96, 5, 1, 2)
        if self.allow_avg:
            self.avgpool5 = nn.AvgPool2d(kernel_size=16, stride=16, padding=0)
        self.conv23 = nn.Conv2d(96, 24, kernel_size=1, stride=1, padding=0)
        self.conv24 = nn.Conv2d(24, 96, kernel_size=1, stride=1, padding=0)

        self.conv25 = nn.Conv2d(96, 32, 1)

        self.conv26 = nn.Conv2d(32, 96, kernel_size=1, stride=1, padding=0)
        self.depthwise8 = DepthwiseConv2d(96, 5, 1, 2)
        self.avgpool6 = nn.AvgPool2d(kernel_size=16, stride=16, padding=0)
        self.conv27 = nn.Conv2d(96, 24, kernel_size=1, stride=1, padding=0)
        self.conv28 = nn.Conv2d(24, 96, kernel_size=1, stride=1, padding=0)

        self.conv29 = nn.Conv2d(96, 32, 1)

        self.conv30 = nn.Conv2d(32, 128, 1)
        if self.allow_avg:
            # Same configuration as the avgpool6 created above; `forward` uses
            # avgpool6 in both places, so this re-assignment is a no-op.
            self.avgpool6 = nn.AvgPool2d(kernel_size=16, stride=16, padding=0)
        self.conv31 = nn.Conv2d(32, 128, kernel_size=1, stride=1, padding=0)
        self.conv32 = nn.Conv2d(128, 24, 1)
        if self.allow_avg:
            self.avgpool7 = nn.AvgPool2d(kernel_size=32, stride=32, padding=0)
        self.conv33 = nn.Conv2d(24, 24, 1)
        self.conv34 = nn.Conv2d(24, 24, 1)
        self.conv35 = nn.Conv2d(24, 24, 1)
        self.depthwise9 = DepthwiseConv2d(24, 3, 1, 1)

        self.conv36 = nn.Conv2d(24, 16, 1)
        self.avgpool8 = nn.AvgPool2d(kernel_size=64, stride=64, padding=0)
        self.conv37 = nn.Conv2d(16, 16, 1)
        self.conv38 = nn.Conv2d(16, 16, 1)
        self.conv39 = nn.Conv2d(16, 16, 1)
        self.depthwise10 = DepthwiseConv2d(16, 3, 1, 1)

        self.conv40 = nn.Conv2d(16, 16, 1)
        if self.allow_avg:
            self.avgpool9 = nn.AvgPool2d(kernel_size=128, stride=128, padding=0)
        self.conv41 = nn.Conv2d(16, 16, 1)
        self.conv42 = nn.Conv2d(16, 16, 1)
        self.conv43 = nn.Conv2d(16, 16, 1)
        self.depthwise11 = DepthwiseConv2d(16, 3, 1, 1)
        self.transpose_conv = nn.ConvTranspose2d(16, 1, 2, 2, 0)

        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear")

    @classmethod
    def from_pretrained(cls, image_type: str = DEFAULT_IMAGE_TYPE):
        """
        Load the TFLite weights and convert them to PyTorch checkpoint.
        Weights for square input are different from landscape input.
        Hence, based on image_type different weights are loaded and
        different model instance is returned.

        Parameters:
            image_type: str (choices: square or landscape)
                Instance of two model variations can be created:
                * One for square images (H=W)
                * One for rectangle images (landscape format)

        Returns:
            Torch model with pretrained weights loaded.
        """
        front_net = cls(image_type)
        destination_path = MEDIAPIPE_SELFIE_CKPT_MAP[image_type].fetch()
        # Read the flatbuffer through a context manager so the file handle is
        # closed deterministically instead of being left to the garbage collector.
        with open(destination_path, "rb") as tflite_file:
            front_data = tflite_file.read()
        front_model = Model.GetRootAsModel(front_data, 0)
        front_subgraph = front_model.Subgraphs(0)
        # Tensor name -> index within the subgraph.
        front_tensor_dict = {
            (front_subgraph.Tensors(i).Name().decode("utf8")): i
            for i in range(front_subgraph.TensorsLength())
        }

        front_probable_names = get_probable_names(front_subgraph)
        front_convert = get_convert(front_net, front_probable_names)
        front_state_dict = build_state_dict(
            front_model, front_subgraph, front_tensor_dict, front_net, front_convert
        )
        front_net.load_state_dict(front_state_dict, strict=True)
        return front_net

    def get_input_spec(self, batch_size: int = 1) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type)) of the model.
        The spatial size is 256x256 for "square" and 144x256 (HxW) otherwise.
        """
        if self.image_type == "square":
            height, width = 256, 256
        else:
            height, width = 144, 256
        return {"image": ((batch_size, 3, height, width), "float32")}

    def forward(self, image):
        """
        Parameters:
            image: Input image to be segmented.
                Square: Shape [1, 3, 256, 256]
                Landscape: Shape [1, 3, 144, 256]
                Channel layout: RGB

        Returns:
            output (mask): Single-channel mask with person and the background
                segmented, squashed to (0, 1) by a sigmoid.
                Square: Shape [1, 1, 256, 256]
                Landscape: Shape [1, 1, 144, 256]
        """
        # ---- Encoder -------------------------------------------------------
        x = self.hardswish(self.conv1(image))
        x1 = x  # skip connection for the last decoder stage
        x = self.relu(self.conv2(x))
        x = self.relu(self.depthwise1(x))
        x1_1 = x
        if self.allow_avg:
            x = self.avgpool1(x)
        x = self.relu(self.conv3(x))
        x = self.sigmoid(self.conv4(x)) * x1_1
        x = self.conv5(x)
        x3 = x  # skip connection for the second decoder stage
        x = self.relu(self.conv6(x))
        x = self.relu(self.depthwise2(x))
        x = self.conv7(x)
        x4 = x
        x = self.relu(self.conv8(x))
        x = self.relu(self.depthwise3(x))
        x = self.conv9(x)
        x = x + x4
        x2 = x  # skip connection for the first decoder stage
        x = self.hardswish(self.conv10(x))
        x = self.hardswish(self.depthwise4(x))
        x2_2 = x
        if self.allow_avg:
            x = self.avgpool2(x)
        x = self.relu(self.conv11(x))
        x = self.sigmoid(self.conv12(x)) * x2_2
        x = self.conv13(x)

        x5 = x
        x = self.hardswish(self.conv14(x))
        x = self.hardswish(self.depthwise5(x))
        x3_3 = x
        if self.allow_avg:
            x = self.avgpool3(x)
        x = self.relu(self.conv15(x))
        x = self.sigmoid(self.conv16(x)) * x3_3
        x = self.conv17(x)
        x = x + x5

        x5 = x
        x = self.hardswish(self.conv18(x))
        x = self.hardswish(self.depthwise6(x))
        x4_4 = x
        if self.allow_avg:
            x = self.avgpool4(x)
        x = self.relu(self.conv19(x))
        x = self.sigmoid(self.conv20(x)) * x4_4
        x = self.conv21(x)
        x = x + x5

        x5 = x
        x = self.hardswish(self.conv22(x))
        x = self.hardswish(self.depthwise7(x))
        x5_5 = x
        if self.allow_avg:
            x = self.avgpool5(x)
        x = self.relu(self.conv23(x))
        x = self.sigmoid(self.conv24(x)) * x5_5
        x = self.conv25(x)
        x = x + x5

        x5 = x
        x = self.hardswish(self.conv26(x))
        x = self.hardswish(self.depthwise8(x))
        x6_6 = x
        if self.allow_avg:
            x = self.avgpool6(x)
        x = self.relu(self.conv27(x))
        x = self.sigmoid(self.conv28(x)) * x6_6
        x = self.conv29(x)
        x = x + x5

        x7_7 = x

        x = self.relu(self.conv30(x))
        if self.allow_avg:
            x7_7 = self.avgpool6(x7_7)

        x = x * self.sigmoid(self.conv31(x7_7))

        # ---- Decoder: three upsample stages gated by the encoder skips ------
        x = self.upsample(x)
        x6 = self.conv32(x)
        x = x2 + x6
        if self.allow_avg:
            x = self.avgpool7(x)
        x = x6 + x2 * self.sigmoid(self.conv34(self.relu(self.conv33(x))))
        x7 = self.relu(self.conv35(x))
        x = x7 + self.relu(self.depthwise9(x7))

        x = self.upsample(x)
        x = self.conv36(x)
        x8 = x
        x = x3 + x8
        if self.allow_avg:
            x = self.avgpool8(x)
        x = x8 + x3 * self.sigmoid(self.conv38(self.relu(self.conv37(x))))
        x = self.relu(self.conv39(x))
        x9 = x
        x = x9 + self.relu(self.depthwise10(x9))

        x = self.upsample(x)
        x = self.conv40(x)
        x10 = x
        x = x10 + x1
        if self.allow_avg:
            x = self.avgpool9(x)
        x = x10 + x1 * self.sigmoid(self.conv42(self.relu(self.conv41(x))))
        x11 = self.relu(self.conv43(x))
        x = x11 + self.relu(self.depthwise11(x11))

        # Final stride-2 transposed convolution restores the input resolution.
        x = self.sigmoid(self.transpose_conv(x))

        return x
torchscript_onnx_qnn: + inference_time: 801.0 + throughput: 1248.4394506866417 + estimated_peak_memory_range: + min: 811008 + max: 91168416 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: jz5wl3mzp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:39:49.005922Z' diff --git a/qai_hub_models/models/mediapipe_selfie/requirements.txt b/qai_hub_models/models/mediapipe_selfie/requirements.txt new file mode 100644 index 00000000..a97d948f --- /dev/null +++ b/qai_hub_models/models/mediapipe_selfie/requirements.txt @@ -0,0 +1 @@ +tflite==2.10.0 diff --git a/qai_hub_models/models/mediapipe_selfie/test.py b/qai_hub_models/models/mediapipe_selfie/test.py new file mode 100644 index 00000000..bbd351d8 --- /dev/null +++ b/qai_hub_models/models/mediapipe_selfie/test.py @@ -0,0 +1,38 @@ +import numpy as np + +from qai_hub_models.models.mediapipe_selfie.app import SelfieSegmentationApp +from qai_hub_models.models.mediapipe_selfie.demo import IMAGE_ADDRESS +from qai_hub_models.models.mediapipe_selfie.demo import main as demo_main +from qai_hub_models.models.mediapipe_selfie.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + SelfieSegmentation, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "selfie_output.jpg" +) + + +def test_output(): + input_img = load_image( + IMAGE_ADDRESS, + ) + model = SelfieSegmentation.from_pretrained() + output = SelfieSegmentationApp(model).predict(input_img) + expected_output = load_image( + OUTPUT_IMAGE_ADDRESS, + ).convert("L") + + expected_output = np.array(expected_output) + np.testing.assert_allclose( + np.round(np.asarray(expected_output, dtype=np.float32) / 
255, 2), + np.round(np.asarray(output, dtype=np.float32), 2), + rtol=0.1, + atol=0.1, + ) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/mediapipe_selfie/utils.py b/qai_hub_models/models/mediapipe_selfie/utils.py new file mode 100644 index 00000000..ab69b5c5 --- /dev/null +++ b/qai_hub_models/models/mediapipe_selfie/utils.py @@ -0,0 +1,73 @@ +# Source: https://github.com/hollance/BlazeFace-PyTorch/blob/master/Convert.ipynb +from collections import OrderedDict + +import numpy as np +import torch + + +def get_shape(tensor): + """Get shape for a TFLIte tensor.""" + return [tensor.Shape(i) for i in range(tensor.ShapeLength())] + + +def get_parameters(graph): + """Get parameters for a TFLite graph.""" + parameters = {} + for i in range(graph.TensorsLength()): + tensor = graph.Tensors(i) + if tensor.Buffer() > 0: + name = tensor.Name().decode("utf8") + parameters[name] = tensor.Buffer() + return parameters + + +def get_weights(model, graph, tensor_dict, tensor_name): + """Get weights using tensor name.""" + i = tensor_dict[tensor_name] + tensor = graph.Tensors(i) + buffer = tensor.Buffer() + shape = get_shape(tensor) + assert tensor.Type() == 1 + W = model.Buffers(buffer).DataAsNumpy() + W = W.view(dtype=np.float16) + W = W.reshape(shape) + return W + + +def get_probable_names(graph): + """Get the probable names for nodes in a graph.""" + probable_names = [] + for i in range(0, graph.TensorsLength()): + tensor = graph.Tensors(i) + if tensor.Buffer() > 0 and (tensor.Type() == 0 or tensor.Type() == 1): + probable_names.append(tensor.Name().decode("utf-8")) + return probable_names + + +def get_convert(net, probable_names): + """Convert state dict using probable node names.""" + convert = {} + i = 0 + for name, params in net.state_dict().items(): + convert[name] = probable_names[i] + i += 1 + return convert + + +def build_state_dict(model, graph, tensor_dict, net, convert): + """ + Building the state dict for PyTorch graph. 
A few layers + will need their weights to be transformed like Convolutions + and Depthwise Convolutions. + """ + new_state_dict = OrderedDict() + for dst, src in convert.items(): + W = get_weights(model, graph, tensor_dict, src) + if W.ndim == 4: + if W.shape[0] == 1: + W = W.transpose((3, 0, 1, 2)) # depthwise conv + else: + W = W.transpose((0, 3, 1, 2)) # regular conv + + new_state_dict[dst] = torch.from_numpy(np.array(W)) + return new_state_dict diff --git a/qai_hub_models/models/mnasnet05/README.md b/qai_hub_models/models/mnasnet05/README.md new file mode 100644 index 00000000..0db50a4c --- /dev/null +++ b/qai_hub_models/models/mnasnet05/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [MNASNet05: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/mnasnet05) + +MNASNet05 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of MNASNet05 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mnasnet05). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mnasnet05.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. 
Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mnasnet05.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MNASNet05 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [MnasNet: Platform-Aware Neural Architecture Search for Mobile](https://arxiv.org/abs/1807.11626) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py) diff --git a/qai_hub_models/models/mnasnet05/__init__.py b/qai_hub_models/models/mnasnet05/__init__.py new file mode 100644 index 00000000..2e1c9a3f --- /dev/null +++ b/qai_hub_models/models/mnasnet05/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import MNASNet05 as Model # noqa: F401 diff --git a/qai_hub_models/models/mnasnet05/demo.py b/qai_hub_models/models/mnasnet05/demo.py new file mode 100644 index 00000000..55cfc2de --- /dev/null +++ b/qai_hub_models/models/mnasnet05/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.mnasnet05.model import MNASNet05 + + +def main(is_test: bool = False): + imagenet_demo(MNASNet05, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mnasnet05/export.py 
b/qai_hub_models/models/mnasnet05/export.py new file mode 100644 index 00000000..323e6f0f --- /dev/null +++ b/qai_hub_models/models/mnasnet05/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.mnasnet05 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. 
+ Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "mnasnet05" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "mnasnet05", + "MNASNet05", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mnasnet05/info.yaml b/qai_hub_models/models/mnasnet05/info.yaml new file mode 100644 index 00000000..ba8994d3 --- /dev/null +++ b/qai_hub_models/models/mnasnet05/info.yaml @@ -0,0 +1,38 @@ +name: MNASNet05 +# id must match with the model dir name in qai_hub_models +id: mnasnet05 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: MNASNet05 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone +research_paper: https://arxiv.org/abs/1807.11626 +research_paper_title: "MnasNet: Platform-Aware Neural Architecture Search for Mobile" +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py +technical_details: + Number of parameters: 2.22M + Model size: 8.59 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/mnasnet05/model.py b/qai_hub_models/models/mnasnet05/model.py new file mode 100644 index 00000000..f98dc692 --- /dev/null +++ b/qai_hub_models/models/mnasnet05/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class MNASNet05(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.mnasnet0_5(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/mnasnet05/perf.yaml b/qai_hub_models/models/mnasnet05/perf.yaml new file mode 100644 index 00000000..8282b9b2 --- /dev/null +++ b/qai_hub_models/models/mnasnet05/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 
Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: MNASNet05 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 370.0 + throughput: 2702.7027027027025 + estimated_peak_memory_range: + min: 12288 + max: 8955784 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 69 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 69 + job_id: jmg9zyxvp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 367.0 + throughput: 2724.7956403269754 + estimated_peak_memory_range: + min: 196608 + max: 36330664 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 102 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 102 + job_id: jnp1nwvlg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:13:59.738307Z' diff --git a/qai_hub_models/models/mnasnet05/test.py b/qai_hub_models/models/mnasnet05/test.py new file mode 100644 index 00000000..638c2a8a --- /dev/null +++ b/qai_hub_models/models/mnasnet05/test.py @@ -0,0 +1,21 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.mnasnet05.demo import main as demo_main +from qai_hub_models.models.mnasnet05.model import MODEL_ID, MNASNet05 + + +def test_task(): + run_imagenet_classifier_test( + MNASNet05.from_pretrained(), MODEL_ID, probability_threshold=0.69 + ) + + +def test_trace(): + run_imagenet_classifier_trace_test(MNASNet05.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git 
a/qai_hub_models/models/mobilenet_v2/README.md b/qai_hub_models/models/mobilenet_v2/README.md new file mode 100644 index 00000000..a207207f --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [MobileNet-v2: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/mobilenet_v2) + +MobileNetV2 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of MobileNet-v2 found +[here](https://github.com/tonylins/pytorch-mobilenet-v2/tree/master). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mobilenet_v2). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mobilenet_v2.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mobilenet_v2.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. 
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MobileNet-v2 can be found + [here](https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/LICENSE). + + +## References +* [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) +* [Source Model Implementation](https://github.com/tonylins/pytorch-mobilenet-v2/tree/master) diff --git a/qai_hub_models/models/mobilenet_v2/__init__.py b/qai_hub_models/models/mobilenet_v2/__init__.py new file mode 100644 index 00000000..37bbf5cb --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import MobileNetV2 as Model # noqa: F401 diff --git a/qai_hub_models/models/mobilenet_v2/demo.py b/qai_hub_models/models/mobilenet_v2/demo.py new file mode 100644 index 00000000..adb7ebc9 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.mobilenet_v2.model import MobileNetV2 + + +def main(is_test: bool = False): + imagenet_demo(MobileNetV2, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v2/export.py b/qai_hub_models/models/mobilenet_v2/export.py new file mode 100644 index 00000000..434941a6 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.mobilenet_v2 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "mobilenet_v2" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "mobilenet_v2", + "MobileNet-v2", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v2/info.yaml b/qai_hub_models/models/mobilenet_v2/info.yaml new file mode 100644 index 00000000..b7a92f0c --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/info.yaml @@ -0,0 +1,40 @@ +name: MobileNet-v2 +# id must match with the model dir name in qai_hub_models +id: mobilenet_v2 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: MobileNetV2 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone + - real-time +research_paper: https://arxiv.org/abs/1801.04381 +research_paper_title: "MobileNetV2: Inverted Residuals and Linear Bottlenecks" +license: https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/LICENSE +source_repo: https://github.com/tonylins/pytorch-mobilenet-v2/tree/master +technical_details: + Number of parameters: 3.50M + Model size: 13.6 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2_quantized + - squeezenet1_1 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v2/model.py b/qai_hub_models/models/mobilenet_v2/model.py new file mode 100644 index 00000000..11f0ded7 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/model.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import json + +import torch + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +MOBILENETV2_WEIGHTS = "mobilenet_v2.pth.tar" +# MOBILENETV2_WEIGHTS = "torch_mobilenetv2_w8a8_state_dict.pth" +# from https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/mobilenetv2/model/model_cards/mobilenetv2_w8a8.json +MOBILENETV2_CFG = "mobilenetv2_w8a8.json" +MOBILENETV2_SOURCE_REPOSITORY = "https://github.com/tonylins/pytorch-mobilenet-v2" +MOBILENETV2_SOURCE_REPO_COMMIT = "99f213657e97de463c11c9e0eaca3bda598e8b3f" + + +class MobileNetV2(ImagenetClassifier): + def __init__( + self, + mobilenet_v2_model: torch.nn.Module, + ) -> None: + 
super().__init__(mobilenet_v2_model) + + @classmethod + def from_pretrained(cls) -> MobileNetV2: + model = _load_mobilenet_v2_source_model() + checkpoint_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, MOBILENETV2_WEIGHTS + ).fetch() + checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu")) + # rename classifier.1.weight -> classifier.weight, and bias similarly + state_dict = { + k.replace("classifier.1", "classifier"): v for k, v in checkpoint.items() + } + model.load_state_dict(state_dict) + model.eval() + + return cls(model) + + +def _load_mobilenet_v2_source_model( + keep_sys_path=False, +) -> torch.nn.Module: + cfg_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, MOBILENETV2_CFG + ).fetch() + with open(cfg_path, "r") as f: + cfg = json.load(f) + with SourceAsRoot( + MOBILENETV2_SOURCE_REPOSITORY, + MOBILENETV2_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + keep_sys_path=keep_sys_path, + ): + # necessary import. `MobileNetV2` comes from the pytorch-mobilenet-v2 repo (MOBILENETV2_SOURCE_REPOSITORY). 
+ from MobileNetV2 import MobileNetV2 as _MobileNetV2 + + return _MobileNetV2( + n_class=cfg["model_args"]["num_classes"], + input_size=cfg["model_args"]["input_size"], + width_mult=cfg["model_args"]["width_mult"], + ) diff --git a/qai_hub_models/models/mobilenet_v2/perf.yaml b/qai_hub_models/models/mobilenet_v2/perf.yaml new file mode 100644 index 00000000..7aa2220c --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: MobileNet-v2 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 533.0 + throughput: 1876.172607879925 + estimated_peak_memory_range: + min: 20480 + max: 1466112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 70 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 70 + job_id: jep2r9vmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 809.0 + throughput: 1236.0939431396787 + estimated_peak_memory_range: + min: 618496 + max: 5733064 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 104 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 104 + job_id: jqpyoj745 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:29:39.371442Z' diff --git a/qai_hub_models/models/mobilenet_v2/test.py 
b/qai_hub_models/models/mobilenet_v2/test.py new file mode 100644 index 00000000..6ff41b76 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/test.py @@ -0,0 +1,31 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.mobilenet_v2.demo import main as demo_main +from qai_hub_models.models.mobilenet_v2.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + MobileNetV2, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_task(): + run_imagenet_classifier_test( + MobileNetV2.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + probability_threshold=0.39, + ) + + +@skip_clone_repo_check +def test_trace(): + run_imagenet_classifier_trace_test(MobileNetV2.from_pretrained()) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/README.md b/qai_hub_models/models/mobilenet_v2_quantized/README.md new file mode 100644 index 00000000..b95cd4f0 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [MobileNet-v2-Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/mobilenet_v2_quantized) + +MobileNetV2 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of MobileNet-v2-Quantized found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/mobilenetv2). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mobilenet_v2_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mobilenet_v2_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mobilenet_v2_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MobileNet-v2-Quantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf).
+ + +## References +* [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/mobilenetv2) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/__init__.py b/qai_hub_models/models/mobilenet_v2_quantized/__init__.py new file mode 100644 index 00000000..b60bccbb --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import MobileNetV2Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/mobilenet_v2_quantized/demo.py b/qai_hub_models/models/mobilenet_v2_quantized/demo.py new file mode 100644 index 00000000..bc9c1422 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable + + +def main(is_test: bool = False): + imagenet_demo(MobileNetV2Quantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v2_quantized/export.py b/qai_hub_models/models/mobilenet_v2_quantized/export.py new file mode 100644 index 00000000..7f1922c2 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/export.py @@ -0,0 +1,195 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.mobilenet_v2_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices.
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/mobilenet_v2_quantized` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "mobilenet_v2_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "mobilenet_v2_quantized", + "MobileNet-v2-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2.
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last. NOTE(review): sample_inputs (not hub_inputs) is passed here, so the QNN-specific hub_inputs assigned above appears to be discarded; confirm this is intended. + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6.
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v2_quantized/info.yaml b/qai_hub_models/models/mobilenet_v2_quantized/info.yaml new file mode 100644 index 00000000..d7be9942 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/info.yaml @@ -0,0 +1,40 @@ +name: MobileNet-v2-Quantized +id: mobilenet_v2_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: MobileNetV2 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases.
+use_case: Image Classification +tags: + - backbone + - real-time + - quantized +research_paper: https://arxiv.org/abs/1801.04381 +research_paper_title: "MobileNetV2: Inverted Residuals and Linear Bottlenecks" +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/mobilenetv2 +technical_details: + Number of parameters: 3.50M + Model size: 13.6 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - squeezenet1_1 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v2_quantized/model.py b/qai_hub_models/models/mobilenet_v2_quantized/model.py new file mode 100644 index 00000000..82ce982b --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/model.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.mobilenet_v2.model import ( + MobileNetV2, + _load_mobilenet_v2_source_model, +) +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 2 + +# Weights downloaded from https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/torch_mobilenetv2_w8a8_state_dict.pth +QUANTIZED_WEIGHTS = "torch_mobilenetv2_w8a8_state_dict.pth" +DEFAULT_ENCODINGS = "encodings.json" + + +class MobileNetV2Quantizable(AIMETQuantizableMixin, MobileNetV2): + """MobileNetV2 with post train quantization support.""" + + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + MobileNetV2.__init__(self, quant_sim_model.model) + AIMETQuantizableMixin.__init__( + self, + quant_sim_model, + ) + + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "MobileNetV2Quantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. 
+ """ + # Load Model + model_fp32 = _load_mobilenet_v2_source_model( + keep_sys_path=True, + ) + input_shape = MobileNetV2(None).get_input_spec()["image_tensor"][0] + # Following + # https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/mobilenetv2/model/model_definition.py#L64 + equalize_model(model_fp32, input_shape) + + # Download weights and quantization parameters + weights = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS + ).fetch() + aimet_config = get_default_aimet_config() + + # Load the QAT/PTQ tuned model_fp32 weights + checkpoint = torch.load(weights, map_location=torch.device("cpu")) + state_dict = { + k.replace("classifier.1", "classifier"): v + for k, v in checkpoint["state_dict"].items() + } + model_fp32.load_state_dict(state_dict) + sim = QuantizationSimModel( + model_fp32, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=aimet_config, + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) + + def get_hub_compile_options( + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml new file mode 100644 index 00000000..9521afec --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google 
Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: MobileNet-v2-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 240.0 + throughput: 4166.666666666667 + estimated_peak_memory_range: + min: 12288 + max: 1557248 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 70 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 70 + job_id: j1p8em3zp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:15:21.382192Z' diff --git a/qai_hub_models/models/mobilenet_v2_quantized/test.py b/qai_hub_models/models/mobilenet_v2_quantized/test.py new file mode 100644 index 00000000..fd4ff6a7 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/test.py @@ -0,0 +1,37 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.mobilenet_v2_quantized.demo import main as demo_main +from qai_hub_models.models.mobilenet_v2_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + MobileNetV2Quantizable, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def 
test_task(): + run_imagenet_classifier_test( + MobileNetV2Quantizable.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + probability_threshold=0.56, + diff_tol=0.06, + ) + + +@skip_clone_repo_check +def test_trace(): + run_imagenet_classifier_trace_test( + MobileNetV2Quantizable.from_pretrained(), + is_quantized=True, + atol=0.03, + ) + + +@skip_clone_repo_check +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/mobilenet_v3_large/README.md b/qai_hub_models/models/mobilenet_v3_large/README.md new file mode 100644 index 00000000..130a4e3c --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [MobileNet-v3-Large: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/mobilenet_v3_large) + +MobileNetV3Large is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of MobileNet-v3-Large found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/mobilenet_v3_large). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mobilenet_v3_large.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts.
Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mobilenet_v3_large.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of MobileNet-v3-Large can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py) diff --git a/qai_hub_models/models/mobilenet_v3_large/__init__.py b/qai_hub_models/models/mobilenet_v3_large/__init__.py new file mode 100644 index 00000000..82352da8 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import MobileNetV3Large as Model # noqa: F401 diff --git a/qai_hub_models/models/mobilenet_v3_large/demo.py b/qai_hub_models/models/mobilenet_v3_large/demo.py new file mode 100644 index 00000000..d2a62908 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.mobilenet_v3_large.model import MobileNetV3Large + + +def main(is_test: bool = False): + imagenet_demo(MobileNetV3Large, is_test) + + +if __name__ == "__main__": + main() diff --git 
a/qai_hub_models/models/mobilenet_v3_large/export.py b/qai_hub_models/models/mobilenet_v3_large/export.py new file mode 100644 index 00000000..33d6fce8 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.mobilenet_v3_large import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference.
+ + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `build/mobilenet_v3_large` under the current working directory. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "mobilenet_v3_large" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "mobilenet_v3_large", + "MobileNet-v3-Large", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1.
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6.
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v3_large/info.yaml b/qai_hub_models/models/mobilenet_v3_large/info.yaml new file mode 100644 index 00000000..acb36842 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/info.yaml @@ -0,0 +1,39 @@ +name: MobileNet-v3-Large +# id must match with the model dir name in qai_hub_models +id: mobilenet_v3_large +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: MobileNetV3Large is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases.
+use_case: Image Classification +tags: + - backbone + - real-time +research_paper: https://arxiv.org/abs/1905.02244 +research_paper_title: "Searching for MobileNetV3" +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py +technical_details: + Number of parameters: 5.48M + Model size: 21.1 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v3_large/model.py b/qai_hub_models/models/mobilenet_v3_large/model.py new file mode 100644 index 00000000..10ce09e7 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class MobileNetV3Large(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.mobilenet_v3_large(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/mobilenet_v3_large/perf.yaml b/qai_hub_models/models/mobilenet_v3_large/perf.yaml new file mode 100644 index 00000000..5907cc30 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + 
def test_task():
    """Run the shared classifier test helper on the default-weight MobileNetV3Large."""
    # What is asserted is defined by `run_imagenet_classifier_test`; this file
    # only supplies the model instance and its MODEL_ID.
    run_imagenet_classifier_test(MobileNetV3Large.from_pretrained(), MODEL_ID)


def test_trace():
    """Run the shared trace test helper on the default-weight MobileNetV3Large."""
    run_imagenet_classifier_trace_test(MobileNetV3Large.from_pretrained())


def test_demo():
    """Smoke-test the CLI demo entry point in test mode."""
    # Verify demo does not crash
    demo_main(is_test=True)
def main(is_test: bool = False) -> None:
    """
    Run the shared ImageNet classifier demo with MobileNetV3Small.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo`; the test suite calls
            this entry point with `is_test=True`.
    """
    # The model *class* (not an instance) is handed to the shared demo.
    imagenet_demo(MobileNetV3Small, is_test)


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `build/mobilenet_v3_small` under the current working directory.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as `options` to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is not accessible, whatever `export_without_hub_access`
        returns (annotated here as `List[str]`). Otherwise a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
    """
    model_name = "mobilenet_v3_small"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access no job can be submitted; hand everything to the
    # offline helper and return its result directly.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "mobilenet_v3_small",
            "MobileNet-v3-Small",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flag for the `image_tensor` input is always appended to
    # the caller-supplied compile options.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # `sample_inputs` is only bound on this branch; step 6 reads it under
        # the same `not skip_inferencing` guard.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file suffix is always `.tflite`, independent of
        # `dst_runtime` -- presumably fine because `main` builds its parser with
        # `supports_qnn=False`; confirm for direct callers.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): job success is checked with `assert`, which is skipped
    # when Python runs with -O.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Local PyTorch outputs on the same sample inputs, passed to the
        # metrics printer alongside the on-device outputs.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export CLI arguments and forward them to `export_model`."""
    # All Python warnings are silenced for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, supports_qnn=False)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
class MobileNetV3Small(ImagenetClassifier):
    """ImagenetClassifier wrapper around torchvision's `mobilenet_v3_small` network."""

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier:
        """
        Build the classifier from a torchvision checkpoint.

        Parameters:
            weights: Weight identifier forwarded to
                `tv_models.mobilenet_v3_small(weights=...)`.

        Returns:
            An instance of this class wrapping the loaded torchvision network.
        """
        return cls(tv_models.mobilenet_v3_small(weights=weights))
def test_task():
    """Run the shared classifier test helper on the default-weight MobileNetV3Small."""
    # What is asserted is defined by `run_imagenet_classifier_test`; this file
    # only supplies the model instance and its MODEL_ID.
    run_imagenet_classifier_test(MobileNetV3Small.from_pretrained(), MODEL_ID)


def test_trace():
    """Run the shared trace test helper on the default-weight MobileNetV3Small."""
    run_imagenet_classifier_trace_test(MobileNetV3Small.from_pretrained())


def test_demo():
    """Smoke-test the CLI demo entry point in test mode."""
    # Verify demo does not crash
    demo_main(is_test=True)
--git a/qai_hub_models/models/openai_clip/README.md b/qai_hub_models/models/openai_clip/README.md new file mode 100644 index 00000000..a334800c --- /dev/null +++ b/qai_hub_models/models/openai_clip/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [OpenAI-Clip: Multi-modal foundational model for vision and language tasks like image/text similarity and for zero-shot image classification](https://aihub.qualcomm.com/models/openai_clip) + +Contrastive Language-Image Pre-Training (CLIP) uses a ViT like transformer to get visual features and a causal language model to get the text features. Both the text and visual features can then be used for a variety of zero-shot learning tasks. + +This is based on the implementation of OpenAI-Clip found +[here](https://github.com/openai/CLIP/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/openai_clip). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[openai_clip]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.openai_clip.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment.
import numpy as np  # needed for the corrected return annotation of predict_similarity


class ClipApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with Clip.

    The app uses 1 model:
        * Clip

    For a given image input, the app will:
        * pre-process the image
        * pre-process the text
        * Run Clip inference
    """

    def __init__(
        self,
        clip_model: torch.nn.Module,
    ):
        """
        Parameters:
            clip_model: Object exposing `text_encoder`, `image_encoder`,
                `preprocess` and `tokenizer_func` attributes; only these four
                attributes are read.
        """
        # Open AI Clip
        self.text_encoder = clip_model.text_encoder
        self.image_encoder = clip_model.image_encoder
        # Preprocess Compose function from Open AI clip
        self.preprocess = clip_model.preprocess
        self.tokenizer = clip_model.tokenizer_func

    def predict(self, *args, **kwargs):
        """Alias for `predict_similarity`; all arguments are forwarded unchanged."""
        return self.predict_similarity(*args, **kwargs)

    def predict_similarity(
        self, image: torch.Tensor, text: torch.Tensor
    ) -> np.ndarray:
        """
        Score every image against every text prompt.

        Inputs:
            image: torch.Tensor
                Pre-processed image batch, e.g. the output of `process_image`
                (shape [1, 3, 224, 224] for a single image).
            text: torch.Tensor
                Token tensor, e.g. the output of `process_text`.

        Outputs:
            logits_per_image: numpy.ndarray (Shape: [num_images, num_text_prompts])

            The matrix product of the image-encoder output with the transposed
            text-encoder output, computed without gradient tracking and
            converted to a NumPy array on the CPU. The logits of text per
            image can be computed by doing a transpose.

            NOTE(review): earlier documentation describes the values as cosine
            similarities times 100; that depends on the encoders normalizing
            and scaling their outputs, which is not visible here -- confirm.
        """
        with torch.no_grad():
            image_features = self.image_encoder(image)
            text_features = self.text_encoder(text)
            logits_per_image = image_features @ text_features.t()
        # Callers (e.g. the demo) consume a NumPy array, not a tensor.
        return logits_per_image.cpu().numpy()

    def process_image(self, image: "Image") -> torch.Tensor:
        """Process image before calling forward.

        Inputs:
            image: PIL.Image
                Image loaded by Pillow must be provided.
                Example: image = Image.open('<path to image>')

        Outputs:
            processed_image: torch.Tensor
                Output of the model's `preprocess` callable with a leading
                batch dimension of size 1 added.
                # presumably shape [1, 3, 224, 224], RGB, normalized -- this
                # is determined by `preprocess`; confirm against the model.
        """
        return self.preprocess(image).unsqueeze(0)

    def process_text(self, text: str) -> torch.Tensor:
        """Process text into tokens for forward call.

        Input:
            text: str
                Text prompt intended for inference.
                Example: "golden hour"

        Output:
            tokenized_tensor: torch.Tensor
                Whatever the model's tokenizer returns for `text`.
                # presumably shape [1, 77], e.g.
                # tensor([[49406, 3878, 2232, 49407, 0, 0...]]) -- confirm.
        """
        return self.tokenizer(text)

    def get_input_spec(
        self,
        image_size: Tuple[int, int] = (224, 224),
        text_size: Tuple[int, int] = (3, 77),
    ) -> "InputSpec":
        """
        Get the input specification ordered (name -> (shape, type)) pairs for this model.

        This can be used with the qai_hub python API to declare
        the model input specification upon submitting a profile job.

        Parameters:
            image_size: (height, width) of the image input. A single int is
                also accepted and is used for both dimensions.
            text_size: Shape of the text token input, used as-is.

        Returns:
            A dict with an "image" entry of shape (1, 3, *image_size) and type
            "float32", and a "text" entry of shape `text_size` and type "int32".
        """
        if isinstance(image_size, int):
            image_size = (image_size, image_size)
        return {
            "image": ((1, 3, *image_size), "float32"),
            "text": (text_size, "int32"),
        }
def main(is_test: bool = False):
    """
    Rank a set of images against a text prompt with Clip and show the best match.

    Image names without a .jpg/.jpeg/.png extension are skipped. Scores are
    reported only for the images that were actually scored, so a skipped name
    can no longer shift the name/score pairing or be selected as the best match.

    Parameters:
        is_test: When True, command-line arguments are ignored (the defaults
            are used) and the selected image is not displayed or saved.

    Raises:
        ValueError: If none of the requested names has a supported image
            extension.
    """
    # Demo parameters
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--image-dir",
        type=str,
        default=None,
        help="Path to image directory",
    )
    parser.add_argument(
        "--image-names",
        type=str,
        default="image1.jpg,image2.jpg,image3.jpg",
        help="Specify names of the images in the folder.",
    )
    parser.add_argument(
        "--text",
        type=str,
        default="camping under the stars",
        help="Text prompt for image search",
    )
    add_output_dir_arg(parser)
    args = parser.parse_args([] if is_test else None)

    # Load model
    clip_model = Clip.from_pretrained()
    app = ClipApp(clip_model=clip_model)

    text = app.process_text(args.text)

    # These three lists stay index-aligned: entry i of each describes the
    # same image, which is also row i of the prediction vector below.
    scored_names = []
    image_sources = []
    images = []

    # Iterate through images provided by user
    for filename in args.image_names.split(","):
        # Make sure the file is an image
        if os.path.splitext(filename)[1].lower() not in (".jpg", ".jpeg", ".png"):
            print(f"Skipping file {filename}")
            continue
        if args.image_dir:
            source = os.path.join(args.image_dir, filename)
        else:
            # No directory given: fall back to the demo assets in the asset store.
            source = CachedWebModelAsset.from_asset_store(
                MODEL_ID, MODEL_ASSET_VERSION, filename
            )
        scored_names.append(filename)
        image_sources.append(source)
        # Preprocess image
        images.append(app.process_image(load_image(source)))

    if not images:
        # torch.stack rejects an empty list with an opaque error; fail clearly.
        raise ValueError(
            f"No .jpg, .jpeg or .png files found in --image-names: {args.image_names}"
        )

    # Each processed image carries its own batch dimension of 1; stack, then
    # drop that dimension to get one batch of images.
    images = torch.stack(images).squeeze(1)

    # Compute similarity
    predictions = app.predict_similarity(images, text).flatten()

    # Display all the images and their score wrt to the text prompt provided.
    print(f"Searching images by prompt: {args.text}")
    for name, score in zip(scored_names, predictions):
        print(f"\t Image with name: {name} has a similarity score={score}")

    # Show image
    print("Displaying the most relevant image")

    best_index = int(np.argmax(predictions))
    most_relevant_image = load_image(image_sources[best_index])

    if not is_test:
        display_or_save_image(most_relevant_image, args.output_dir)


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    components: Optional[List[str]] = None,
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Mapping[
    str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]]
] | List[str]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        components: List of sub-components of the model that will be exported.
            Each component is compiled and profiled separately.
            Defaults to ALL_COMPONENTS if not specified (or empty).
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `build/openai_clip` under the current working directory.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as `options` to the inference jobs.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained`

    Returns:
        When Qualcomm AI Hub is not accessible, whatever `export_without_hub_access`
        returns (annotated here as `List[str]`). Otherwise a Mapping from
        component_name to a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

    Raises:
        ValueError: If any requested component is not in ALL_COMPONENTS.
    """
    model_name = "openai_clip"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Keep the caller's original value (possibly None) for the offline helper;
    # `components` itself is replaced by the full list when falsy.
    component_arg = components
    components = components or ALL_COMPONENTS
    for component in components:
        if component not in ALL_COMPONENTS:
            raise ValueError(f"Invalid component {component}.")
    # Without hub access no job can be submitted; hand everything to the
    # offline helper and return its result directly.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "openai_clip",
            "OpenAI-Clip",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
            component_arg,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    # Map each requested component name to the sub-model that implements it.
    components_dict = {}
    if "CLIPTextEncoder" in components:
        components_dict["CLIPTextEncoder"] = model.text_encoder
    if "CLIPImageEncoder" in components:
        components_dict["CLIPImageEncoder"] = model.image_encoder

    compile_jobs = {}
    for component_name, component in components_dict.items():
        # Trace the model
        input_spec = component.get_input_spec()
        source_model = torch.jit.trace(component, make_torch_inputs(input_spec))

        # 2. Compile the models to an on-device asset
        # NOTE(review): the channel-last flag for an input named `image` is
        # appended for every component, including the text encoder -- confirm
        # this is harmless for components that have no such input.
        model_compile_options = component.get_hub_compile_options(
            target_runtime, compile_options + " --force_channel_last_input image"
        )
        print(f"Optimizing model {component_name} to run on-device.")
        compile_jobs[component_name] = hub.submit_compile_job(
            model=source_model,
            input_specs=input_spec,
            device=hub.Device(device),
            name=f"{component_name}",
            options=model_compile_options,
        )

    # 3. Profile the model assets on real devices
    profile_jobs = {}
    if not skip_profiling:
        for component_name in components:
            print(f"Profiling model {component_name} on a hosted device.")
            profile_jobs[component_name] = hub.submit_profile_job(
                model=compile_jobs[component_name].get_target_model(),
                device=hub.Device(device),
                name=f"{component_name}",
                options=profile_options,
            )

    # 4. Run inference on-device with sample inputs
    inference_jobs = {}
    if not skip_inferencing:
        for component_name in components:
            print(
                f"Running inference for {component_name} on a hosted device with example inputs."
            )
            sample_inputs = components_dict[component_name].sample_inputs()
            # Convert inputs from channel first to channel last
            hub_inputs = transpose_channel_first_to_last(
                "image", sample_inputs, target_runtime
            )
            inference_jobs[component_name] = hub.submit_inference_job(
                model=compile_jobs[component_name].get_target_model(),
                inputs=hub_inputs,
                device=hub.Device(device),
                name=f"{component_name}",
                options=profile_options,
            )

    # 5. Download the model assets to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        for component_name, compile_job in compile_jobs.items():
            target_model = compile_job.get_target_model()
            # NOTE(review): the file suffix is always `.tflite`, independent
            # of `dst_runtime` -- confirm for non-TFLite targets.
            target_model.download(
                str(output_path / f"{model_name}_{component_name}.tflite")
            )

    # 6. Summarize the results from profiling and inference
    # NOTE(review): job success is checked with `assert`, which is skipped
    # when Python runs with -O.
    if not skip_summary and not skip_profiling:
        for component_name in components:
            profile_job = profile_jobs[component_name]
            assert profile_job.wait().success
            profile_data = profile_job.download_profile()
            print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        for component_name in components:
            inference_job = inference_jobs[component_name]
            # Sample inputs are requested again here rather than reused from
            # step 4; the local PyTorch outputs computed from them are passed
            # to the metrics printer alongside the on-device outputs.
            sample_inputs = components_dict[component_name].sample_inputs()
            torch_out = torch_inference(components_dict[component_name], sample_inputs)
            assert inference_job.wait().success
            inference_result = inference_job.download_output_data()
            print_inference_metrics(inference_job, inference_result, torch_out)

    # Skipped stages leave their dict empty, so `.get` yields None for them.
    return {
        component_name: (
            compile_jobs[component_name],
            profile_jobs.get(component_name, None),
            inference_jobs.get(component_name, None),
        )
        for component_name in components
    }


def main():
    """Parse the export CLI arguments and forward them to `export_model`."""
    # All Python warnings are silenced for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, components=ALL_COMPONENTS)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
PRETRAINED_WEIGHTS = "ViT-B/16"
MODEL_ID = __name__.split(".")[-2]
MODEL_ASSET_VERSION = 1
OPENAI_CLIP_SOURCE_REPOSITORY = "https://github.com/openai/CLIP"
OPENAI_CLIP_SOURCE_REPO_COMMIT = "a1d071733d7111c9c014f024669f959182114e33"


def load_clip_and_tokenizer():
    """Load the pretrained CLIP network together with its helpers.

    Returns:
        A 3-tuple ``(net, preprocess, tokenizer_func)`` where ``net`` and
        ``preprocess`` are what ``clip.load(PRETRAINED_WEIGHTS)`` returns and
        ``tokenizer_func`` is ``clip.tokenize``.
    """
    # `clip` is imported inside the SourceAsRoot context on purpose; presumably
    # the context makes the pinned source repository importable -- confirm
    # against SourceAsRoot.
    with SourceAsRoot(
        OPENAI_CLIP_SOURCE_REPOSITORY,
        OPENAI_CLIP_SOURCE_REPO_COMMIT,
        MODEL_ID,
        MODEL_ASSET_VERSION,
    ):
        import clip

        tokenizer_func = clip.tokenize
        net, preprocess = clip.load(PRETRAINED_WEIGHTS)
        return net, preprocess, tokenizer_func


class Clip(CollectionModel):
    """Bundle of the CLIP text encoder, image encoder and their pre-processing helpers."""

    def __init__(
        self,
        text_encoder: torch.nn.Module,
        image_encoder: torch.nn.Module,
        preprocess: torchvision.transforms.transforms.Compose,
        tokenizer_func: Callable,
    ):
        """Store the two encoders plus the image transform and tokenizer callable."""
        super().__init__()
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.preprocess = preprocess
        self.tokenizer_func = tokenizer_func

    @staticmethod
    def from_pretrained():
        """Load the pretrained weights and wrap them in a ``Clip`` collection."""
        # The loader is handed to callback_with_retry with num_retries=5.
        net, preprocess, tokenizer_func = callback_with_retry(
            num_retries=5, callback=load_clip_and_tokenizer
        )
        return Clip.from_source_model(net, preprocess, tokenizer_func)

    @staticmethod
    def from_source_model(net, preprocess, tokenizer_func):
        """Build a ``Clip`` collection from an already loaded source network.

        The network is switched to eval mode and shared by both encoder wrappers.
        """
        net = net.eval()
        text_encoder = ClipTextEncoder(net)
        image_encoder = ClipImageEncoder(net)
        return Clip(text_encoder, image_encoder, preprocess, tokenizer_func)


class ClipTextEncoder(BaseModel):
    """Wrapper for the text branch of OpenAI CLIP."""

    def __init__(self, net: torch.nn.Module):
        super().__init__()
        self.net = net
        # Upper bound applied to the incoming token ids in forward().
        self.eot_token = 49407

    def forward(self, text: torch.Tensor):
        """Forward call on Open AI CLIP model.

        Inputs:
            text: torch.Tensor (Shape: [1, 77] context_length=77)
                Tokenized text. Values are clamped to [0, eot_token] before
                being passed to the text encoder.

        Outputs:
            text_features: torch.Tensor
                One L2-normalized feature row per text prompt (normalization
                is applied along dim=1). Multiplying the image features by the
                transpose of this tensor gives the image/text similarity matrix.
        """
        clipped_text = torch.clip(text, min=0, max=self.eot_token)
        text_features = self.net.encode_text(clipped_text)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        return text_features

    def get_input_spec(
        self,
        batch_size: int = 1,
        text_length: int = 77,
    ) -> InputSpec:
        """Return the ordered input specification (name -> (shape, type)).

        This can be used with the qai_hub python API to declare the model
        input specification upon submitting a profile job.
        """
        return {
            "text": ((batch_size, text_length), "int32"),
        }

    @classmethod
    def from_pretrained(cls):
        """Load the full CLIP collection and return only its text encoder."""
        return Clip.from_pretrained().text_encoder


class ClipImageEncoder(BaseModel):
    """Wrapper for the image branch of OpenAI CLIP."""

    def __init__(self, net: torch.nn.Module):
        super().__init__()
        self.net = net
        # Not read by this class; kept so the attribute stays available to
        # any external code that relied on it.
        self.eot_token = 49407

    def forward(self, image: torch.Tensor):
        """Forward call on Open AI Clip model.

        Inputs:
            image: torch.Tensor (Shape: [1, 3, 224, 224])
                Pre-processed image tensor.
                NOTE(review): the original documentation states values in 0-1
                and RGB channel layout -- confirm against the preprocess
                transform returned by clip.load.

        Outputs:
            image_features: torch.Tensor
                One L2-normalized feature row per image (normalization along
                dim=1), multiplied by exp(logit_scale) of the source network.
                Multiplying by the transposed text features gives the
                image/text similarity matrix.
        """
        image_features = self.net.encode_image(image)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        return self.net.logit_scale.exp() * image_features

    def get_input_spec(
        self,
        height: int = 224,
        width: int = 224,
    ) -> InputSpec:
        """Return the ordered input specification (name -> (shape, type)).

        This can be used with the qai_hub python API to declare the model
        input specification upon submitting a profile job.
        """
        return {
            "image": ((1, 3, height, width), "float32"),
        }

    @classmethod
    def from_pretrained(cls):
        """Load the full CLIP collection and return only its image encoder."""
        return Clip.from_pretrained().image_encoder
IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store(
    MODEL_ID, MODEL_ASSET_VERSION, "image1.jpg"
)
TEXT = "pyramid in desert"


def _load_app_and_inputs():
    """Build the pretrained model, its app, and the processed sample image/text."""
    model = Clip.from_pretrained()
    app = ClipApp(model)
    image = app.process_image(load_image(IMAGE_ADDRESS))
    text = app.process_text(TEXT)
    return model, app, image, text


@skip_clone_repo_check
def test_prediction():
    """Check that the app yields a truthy similarity for the sample image/text pair."""
    _, app, image, text = _load_app_and_inputs()
    assert app.predict_similarity(image, text)


@skip_clone_repo_check
def test_task():
    """Check that the app's similarity equals the one computed from the raw encoders."""
    model, app, image, text = _load_app_and_inputs()
    text_encoder = model.text_encoder
    image_encoder = model.image_encoder
    text_features = text_encoder(text)
    image_features = image_encoder(image)
    expected = image_features @ text_features.t()
    actual = app.predict_similarity(image, text)

    assert np.allclose(expected.detach().numpy(), actual)


@skip_clone_repo_check
def test_demo():
    """Run the demo entry point in test mode."""
    demo_main(is_test=True)
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/openpose). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[openpose]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.openpose.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.openpose.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of OpenPose can be found + [here](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/LICENSE). 
class OpenPoseApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with OpenPose.

    The app uses 1 model:
        * OpenPose

    For a given image input, the app will:
        * Run OpenPose inference on the image
        * display the output keypoints drawn over the input image
    """

    def __init__(self, openpose_model):
        self.model = openpose_model

    def predict(self, *args, **kwargs):
        # See estimate_pose.
        return self.estimate_pose(*args, **kwargs)

    def estimate_pose(
        self,
        image: Image,
    ) -> Image:
        """
        Perform pose estimation on the provided image.

        Parameters:
            image: Input PIL image

        Returns:
            A PIL image: the result of draw_keypoints applied to the input
            image with the detected keypoints.
        """

        # preprocess
        pixel_values = preprocess_PIL_image(image)

        # Run prediction
        paf, heatmap = self.model(pixel_values)

        # post process heatmaps and paf to get keypoints; the per-person
        # grouping (second return value) is not used for drawing.
        keypoints, _subset = getKeypointsFromPredictions(
            paf, heatmap, pixel_values.shape[2], pixel_values.shape[3]
        )

        output_image = draw_keypoints(image, keypoints, radius=4, alpha=0.8)

        return output_image


def getKeypointsFromPredictions(
    paf: torch.Tensor, heatmap: torch.Tensor, h, w
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert raw OpenPose outputs into keypoint candidates and person groupings.

    Parameters:
        paf: Part-affinity-field output of the model; channels are indexed
            0..37 via ``mapIdx`` below.
        heatmap: Joint heatmap output of the model; channels 0..17 are scanned
            for peaks.
        h, w: Target height and width both outputs are upsampled to.
            NOTE: both tensors are squeezed after upsampling, so a batch
            size of 1 is assumed.

    Returns:
        candidate: array with one row per detected peak: x, y, score, id
            (an empty array when no peak is found).
        subset: n*20 array, 0-17 is the index in candidate, 18 is the total
            score, 19 is the total parts.
    """
    # upsample the PAF and heatmap to be the same size as the original image
    target_size = (h, w)
    upsampled_paf = (
        F.interpolate(paf, size=target_size, mode="bicubic", align_corners=False)
        .detach()
        .numpy()
    )
    heatmap = (
        F.interpolate(heatmap, size=target_size, mode="bicubic", align_corners=False)
        .detach()
        .numpy()
    )

    # reshape for post processing
    heatmap = np.transpose(heatmap.squeeze(), (1, 2, 0))
    paf = np.transpose(upsampled_paf.squeeze(), (1, 2, 0))

    # The following post-processing code comes from the pytorch openpose repo, at
    # https://github.com/Hzzone/pytorch-openpose/blob/5ee71dc10020403dc3def2bb68f9b77c40337ae2/src/body.py#L67C9-L67C9

    all_peaks = []
    peak_counter = 0
    thre1 = 0.1  # minimum smoothed heatmap value for a peak
    thre2 = 0.05  # minimum PAF alignment score for a sampled limb point

    for part in range(18):
        map_ori = heatmap[:, :, part]
        one_heatmap = gaussian_filter(map_ori, sigma=3)

        # Shifted copies of the smoothed map, used to test each pixel against
        # its four direct neighbours.
        map_left = np.zeros(one_heatmap.shape)
        map_left[1:, :] = one_heatmap[:-1, :]
        map_right = np.zeros(one_heatmap.shape)
        map_right[:-1, :] = one_heatmap[1:, :]
        map_up = np.zeros(one_heatmap.shape)
        map_up[:, 1:] = one_heatmap[:, :-1]
        map_down = np.zeros(one_heatmap.shape)
        map_down[:, :-1] = one_heatmap[:, 1:]

        peaks_binary = np.logical_and.reduce(
            (
                one_heatmap >= map_left,
                one_heatmap >= map_right,
                one_heatmap >= map_up,
                one_heatmap >= map_down,
                one_heatmap > thre1,
            )
        )
        peaks = list(
            zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])
        )  # note reverse
        # The score is read from the unsmoothed map at the peak location.
        peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
        peak_id = range(peak_counter, peak_counter + len(peaks))
        peaks_with_score_and_id = [
            peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))
        ]

        all_peaks.append(peaks_with_score_and_id)
        peak_counter += len(peaks)

    # find connection in the specified sequence, center 29 is in the position 15
    limbSeq = [
        [2, 3],
        [2, 6],
        [3, 4],
        [4, 5],
        [6, 7],
        [7, 8],
        [2, 9],
        [9, 10],
        [10, 11],
        [2, 12],
        [12, 13],
        [13, 14],
        [2, 1],
        [1, 15],
        [15, 17],
        [1, 16],
        [16, 18],
        [3, 17],
        [6, 18],
    ]
    # the middle joints heatmap correspondence
    mapIdx = [
        [31, 32],
        [39, 40],
        [33, 34],
        [35, 36],
        [41, 42],
        [43, 44],
        [19, 20],
        [21, 22],
        [23, 24],
        [25, 26],
        [27, 28],
        [29, 30],
        [47, 48],
        [49, 50],
        [53, 54],
        [51, 52],
        [55, 56],
        [37, 38],
        [45, 46],
    ]

    connection_all = []
    special_k = []  # limb indices for which one endpoint has no candidates
    mid_num = 10  # number of points sampled along each candidate limb

    for k in range(len(mapIdx)):
        score_mid = paf[:, :, [x - 19 for x in mapIdx[k]]]
        candA = all_peaks[limbSeq[k][0] - 1]
        candB = all_peaks[limbSeq[k][1] - 1]
        nA = len(candA)
        nB = len(candB)
        if nA != 0 and nB != 0:
            connection_candidate = []
            for i in range(nA):
                for j in range(nB):
                    vec = np.subtract(candB[j][:2], candA[i][:2])
                    norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                    norm = max(0.001, norm)
                    vec = np.divide(vec, norm)

                    startend = list(
                        zip(
                            np.linspace(candA[i][0], candB[j][0], num=mid_num),
                            np.linspace(candA[i][1], candB[j][1], num=mid_num),
                        )
                    )

                    vec_x = np.array(
                        [
                            score_mid[
                                int(round(startend[index][1])),
                                int(round(startend[index][0])),
                                0,
                            ]
                            for index in range(len(startend))
                        ]
                    )
                    vec_y = np.array(
                        [
                            score_mid[
                                int(round(startend[index][1])),
                                int(round(startend[index][0])),
                                1,
                            ]
                            for index in range(len(startend))
                        ]
                    )

                    score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(
                        vec_y, vec[1]
                    )
                    score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                        0.5 * h / norm - 1, 0
                    )
                    criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(
                        score_midpts
                    )
                    criterion2 = score_with_dist_prior > 0
                    if criterion1 and criterion2:
                        connection_candidate.append(
                            [
                                i,
                                j,
                                score_with_dist_prior,
                                score_with_dist_prior + candA[i][2] + candB[j][2],
                            ]
                        )

            connection_candidate = sorted(
                connection_candidate, key=lambda x: x[2], reverse=True
            )
            # Greedy assignment: best-scoring candidates first, each endpoint
            # used at most once.
            connection = np.zeros((0, 5))
            for c in range(len(connection_candidate)):
                i, j, s = connection_candidate[c][0:3]
                if i not in connection[:, 3] and j not in connection[:, 4]:
                    connection = np.vstack(
                        [connection, [candA[i][3], candB[j][3], s, i, j]]
                    )
                    if len(connection) >= min(nA, nB):
                        break

            connection_all.append(connection)
        else:
            special_k.append(k)
            connection_all.append([])

    # last number in each row is the total parts number of that person
    # the second last number in each row is the score of the overall configuration
    subset = -1 * np.ones((0, 20))
    candidate = np.array([item for sublist in all_peaks for item in sublist])

    for k in range(len(mapIdx)):
        if k not in special_k:
            partAs = connection_all[k][:, 0]
            partBs = connection_all[k][:, 1]
            indexA, indexB = np.array(limbSeq[k]) - 1

            for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                found = 0
                subset_idx = [-1, -1]
                for j in range(len(subset)):  # 1:size(subset,1):
                    if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
                        subset_idx[found] = j
                        found += 1

                if found == 1:
                    j = subset_idx[0]
                    if subset[j][indexB] != partBs[i]:
                        subset[j][indexB] = partBs[i]
                        subset[j][-1] += 1
                        subset[j][-2] += (
                            candidate[partBs[i].astype(int), 2]
                            + connection_all[k][i][2]
                        )
                elif found == 2:  # if found 2 and disjoint, merge them
                    j1, j2 = subset_idx
                    membership = (
                        (subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int)
                    )[:-2]
                    if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                        subset[j1][:-2] += subset[j2][:-2] + 1
                        subset[j1][-2:] += subset[j2][-2:]
                        subset[j1][-2] += connection_all[k][i][2]
                        subset = np.delete(subset, j2, 0)
                    else:  # as like found == 1
                        subset[j1][indexB] = partBs[i]
                        subset[j1][-1] += 1
                        subset[j1][-2] += (
                            candidate[partBs[i].astype(int), 2]
                            + connection_all[k][i][2]
                        )

                # if find no partA in the subset, create a new subset
                elif not found and k < 17:
                    row = -1 * np.ones(20)
                    row[indexA] = partAs[i]
                    row[indexB] = partBs[i]
                    row[-1] = 2
                    row[-2] = (
                        sum(candidate[connection_all[k][i, :2].astype(int), 2])
                        + connection_all[k][i][2]
                    )
                    subset = np.vstack([subset, row])
    # delete some rows of subset which has few parts occur
    deleteIdx = []
    for i in range(len(subset)):
        if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
            deleteIdx.append(i)
    subset = np.delete(subset, deleteIdx, axis=0)

    # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
    # candidate: x, y, score, id
    return candidate, subset


def draw_keypoints(image: Image, keypoints: np.ndarray, radius=1, alpha=1.0):
    """
    Draw filled green circles for confident keypoints and blend with the input.

    Parameters:
        image: PIL image to draw on (a copy is drawn on; the input is not modified).
        keypoints: iterable of (x, y, score, id) rows.
        radius: circle radius in pixels.
        alpha: blend factor passed to PIL.Image.blend.
            NOTE(review): per the Pillow documentation,
            blend(im1, im2, alpha) computes im1 * (1 - alpha) + im2 * alpha,
            so here alpha weights the ORIGINAL image and alpha=1.0 returns it
            without visible keypoints -- confirm this is intended.

    Returns:
        The blended PIL image.
    """
    # A bare `import PIL` does not load the ImageDraw submodule; import it
    # explicitly instead of relying on another module having done so.
    from PIL import ImageDraw

    overlay = image.copy()
    draw = ImageDraw.Draw(overlay)
    confidence_threshold = 0.8
    for kp in keypoints:
        x, y, v, i = kp
        if v > confidence_threshold:
            draw.ellipse(
                (
                    (int(x - radius), int(y - radius)),
                    (int(x + radius), int(y + radius)),
                ),
                outline=(0, 255, 0),
                fill=(0, 255, 0),
            )

    return PIL.Image.blend(overlay, image, alpha)
IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store(
    MODEL_ID, MODEL_ASSET_VERSION, "openpose_demo.png"
)


def main(is_test: bool = False):
    """Run OpenPose end-to-end on a sample image.

    The demo displays (or saves) the input image with circles drawn over the
    estimated joint positions. With ``is_test=True`` the command line is
    ignored and the result is neither displayed nor saved.
    """
    # Command-line interface
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--image",
        type=str,
        default=IMAGE_ADDRESS,
        help="image file path or URL.",
    )
    add_output_dir_arg(arg_parser)
    cli_args = arg_parser.parse_args([] if is_test else None)

    # Model first, then the image, then inference
    pose_app = OpenPoseApp(OpenPose.from_pretrained())
    input_image = load_image(cli_args.image)
    annotated = pose_app.estimate_pose(input_image)

    if is_test:
        return
    display_or_save_image(annotated, cli_args.output_dir)


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/openpose`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        When AI Hub cannot be accessed, the result of `export_without_hub_access`
        is returned instead.

    Raises:
        AssertionError: if the summary step is enabled and the profile or
            inference job did not succeed.
    """
    model_name = "openpose"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "openpose",
            "OpenPose",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0,output_1",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file extension is fixed to ".tflite" regardless of
        # dst_runtime -- confirm this is intended for non-TFLite targets.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # Raised explicitly (instead of a bare `assert`) so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # sample_inputs was created in step 4, which runs under the same
        # `not skip_inferencing` condition.
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0,output_1", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Parse the export command line and run `export_model` with it."""
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
+use_case: Pose Estimation +tags: [] +research_paper: https://arxiv.org/abs/1812.08008 +research_paper_title: "OpenPose: Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields" +license: https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/LICENSE +source_repo: https://github.com/CMU-Perceptual-Computing-Lab/openpose +technical_details: + Number of parameters: 52M + Model size: 199.5 MB + Model checkpoint: body_pose_model.pth + Input resolution: 240x320 +applicable_scenarios: + - Injury prevention training + - Sports performance analysis + - Posture recognition +form_factors: + - Phone + - Tablet + - IoT +related_models: + - 'litehrnet' + - 'mediapipe_pose' +has_static_banner: yes +has_animated_banner: no +license_type: other +dataset: [] diff --git a/qai_hub_models/models/openpose/model.py b/qai_hub_models/models/openpose/model.py new file mode 100644 index 00000000..b716377a --- /dev/null +++ b/qai_hub_models/models/openpose/model.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from typing import Tuple + +import torch + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +OPENPOSE_SOURCE_REPOSITORY = "https://github.com/CMU-Perceptual-Computing-Lab/openpose" +OPENPOSE_SOURCE_REPO_COMMIT = "80d4c5f7b25ba4c3bf5745ab7d0e6ccd3db8b242" +OPENPOSE_PROXY_REPOSITORY = "https://github.com/Hzzone/pytorch-openpose" +OPENPOSE_PROXY_REPO_COMMIT = "5ee71dc10020403dc3def2bb68f9b77c40337ae2" +# Originally from https://drive.google.com/file/d/1EULkcH_hhSU28qVc1jSJpCh2hGOrzpjK/view +DEFAULT_WEIGHTS = "body_pose_model.pth" +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + + +class OpenPose(BaseModel): + """Exportable OpenPose pose estimation""" + + def __init__( + self, + openpose_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = openpose_model + + @classmethod + def 
from_pretrained(cls, weights_path: str | None = None) -> OpenPose: + """Load OpenPose from a weightfile created by the source OpenPose repository.""" + + # Load PyTorch model from disk + openpose_model = _load_openpose_source_model_from_weights(weights_path) + + return cls(openpose_model) + + def forward(self, image: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Run OpenPose on `image`, and produce keypoints for pose estimation + + Parameters: + image: Pixel values for model consumption. + Range: float32[0-1] + 3-channel Color Space: RGB + Shape: 1xCxHxW + + Returns: + PAF: 1x38xH/8xW/8 (2x number of joints) + Range: float[0, 1] + 2-dimensional relations between different indices that represent body parts + heatmap: 1x19xH/8xW/8 (i value per joint per pixel) + Range: float[0, 1] + 2 dimensional heatmaps representing probabilities for each joint across the image + + The output width and height are downsampled from the input width and height by a factor of 8. + """ + + img_padded = image.squeeze().permute(1, 2, 0) + h = img_padded.shape[0] + w = img_padded.shape[1] + padValue = 128 + stride = 8 + pad = [ + 0, + 0, + 0 if (h % stride == 0) else stride - (h % stride), + 0 if (w % stride == 0) else stride - (w % stride), + ] + # Pad up + pad_up = torch.full((pad[0], w, 3), padValue, dtype=img_padded.dtype) + img_padded = torch.cat((pad_up, img_padded), dim=0) + + # Pad left + pad_left = torch.full((h, pad[1], 3), padValue, dtype=img_padded.dtype) + img_padded = torch.cat((pad_left, img_padded), dim=1) + + # Pad down + pad_down = torch.full((pad[2], w, 3), padValue, dtype=img_padded.dtype) + img_padded = torch.cat((img_padded, pad_down), dim=0) + + # Pad right + pad_right = torch.full((h, pad[3], 3), padValue, dtype=img_padded.dtype) + img_padded = torch.cat((img_padded, pad_right), dim=1) + + # reshape + im = img_padded.permute(2, 0, 1).unsqueeze(0) - 0.5 + + # Run the model + with torch.no_grad(): + paf, heatmap = self.model(im) + + return paf, heatmap + 
+ def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 224, + width: int = 224, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def _load_openpose_source_model_from_weights( + weights_path_body: str | None = None, +) -> torch.nn.Module: + # Load OpenPose model from the source repository using the given weights. + + # OpenPose exists as a Caffe model or Windows binaries in the original repository. + # The proxy repository contains a pytorch implementation, converted from the caffe model + with SourceAsRoot( + OPENPOSE_PROXY_REPOSITORY, + OPENPOSE_PROXY_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + # download the weights file + if not weights_path_body: + weights_path_body = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_WEIGHTS + ).fetch() + + # Import model files from pytorch openpose repo + from src.body import Body + + body_estimation = Body(weights_path_body) + + return body_estimation.model.eval() diff --git a/qai_hub_models/models/openpose/perf.yaml b/qai_hub_models/models/openpose/perf.yaml new file mode 100644 index 00000000..29999a3f --- /dev/null +++ b/qai_hub_models/models/openpose/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - 
Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: OpenPose + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 11747.0 + throughput: 85.12811781731506 + estimated_peak_memory_range: + min: 229376 + max: 2462464 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 103 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 103 + job_id: jnp1nw3kg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 11820.0 + throughput: 84.60236886632826 + estimated_peak_memory_range: + min: 622592 + max: 241891488 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 187 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 187 + job_id: jvgddq0kg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:07:34.029953Z' diff --git a/qai_hub_models/models/openpose/requirements.txt b/qai_hub_models/models/openpose/requirements.txt new file mode 100644 index 00000000..ab4e628e --- /dev/null +++ b/qai_hub_models/models/openpose/requirements.txt @@ -0,0 +1,2 @@ +scipy +matplotlib diff --git a/qai_hub_models/models/openpose/test.py b/qai_hub_models/models/openpose/test.py new file mode 100644 index 00000000..524321f4 --- /dev/null +++ b/qai_hub_models/models/openpose/test.py @@ -0,0 +1,31 @@ +import numpy as np + +from qai_hub_models.models.openpose.app import OpenPoseApp +from qai_hub_models.models.openpose.demo import IMAGE_ADDRESS +from qai_hub_models.models.openpose.demo import main as demo_main +from qai_hub_models.models.openpose.model import MODEL_ASSET_VERSION, MODEL_ID, OpenPose +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, 
"openpose_output.png" +) + + +@skip_clone_repo_check +def test_openpose_app(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = OpenPoseApp(OpenPose.from_pretrained()) + app_output_image = app.estimate_pose(image) + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/quicksrnetlarge/README.md b/qai_hub_models/models/quicksrnetlarge/README.md new file mode 100644 index 00000000..31d3723e --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [QuickSRNetLarge: Upscale images and remove image noise](https://aihub.qualcomm.com/models/quicksrnetlarge) + +QuickSRNet Large is designed for upscaling images on mobile platforms to sharpen in real-time. + +This is based on the implementation of QuickSRNetLarge found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/quicksrnetlarge). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.quicksrnetlarge.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. 
Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.quicksrnetlarge.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of QuickSRNetLarge can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). + + +## References +* [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet) diff --git a/qai_hub_models/models/quicksrnetlarge/__init__.py b/qai_hub_models/models/quicksrnetlarge/__init__.py new file mode 100644 index 00000000..b642d01f --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import QuickSRNetLarge as Model # noqa: F401 diff --git a/qai_hub_models/models/quicksrnetlarge/demo.py b/qai_hub_models/models/quicksrnetlarge/demo.py new file mode 100644 index 00000000..86e74f15 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/demo.py @@ -0,0 +1,25 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.quicksrnetlarge.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetLarge, +) +from qai_hub_models.utils.asset_loaders 
import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnet_demo.jpg" +) + + +# Run QuickSRNet end-to-end on a sample image. +# The demo will display an upscaled image +def main(is_test: bool = False): + super_resolution_demo( + model_cls=QuickSRNetLarge, + default_image=IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetlarge/export.py b/qai_hub_models/models/quicksrnetlarge/export.py new file mode 100644 index 00000000..9749a6a6 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/export.py @@ -0,0 +1,190 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.quicksrnetlarge import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. 
Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "quicksrnetlarge" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "quicksrnetlarge", + "QuickSRNetLarge", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." 
+ ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetlarge/info.yaml b/qai_hub_models/models/quicksrnetlarge/info.yaml new file mode 100644 index 00000000..a3672425 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/info.yaml @@ -0,0 +1,30 @@ +name: QuickSRNetLarge +# id must match with the model dir name in qai_hub_models +id: quicksrnetlarge +status: public +headline: Upscale images and remove image noise. 
+domain: Computer Vision +description: QuickSRNet Large is designed for upscaling images on mobile platforms to sharpen in real-time. +use_case: Super Resolution +tags: [] +research_paper: https://arxiv.org/abs/2303.04336 +research_paper_title: "QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms" +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet +technical_details: + Number of parameters: 436K + Model size: 1.72 MB + Model checkpoint: quicksrnet_large_4x_checkpoint_float32 + Input resolution: 128x128 +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: ['xlsr', 'esrgan', 'quicksrnetlarge_quantized'] +has_static_banner: yes +has_animated_banner: yes +license_type: other +dataset: [] diff --git a/qai_hub_models/models/quicksrnetlarge/model.py b/qai_hub_models/models/quicksrnetlarge/model.py new file mode 100644 index 00000000..b531ade8 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/model.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.superres_evaluator import SuperResolutionOutputEvaluator +from qai_hub_models.models._shared.quicksrnet.common import ( + _load_quicksrnet_source_model, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_large_4x_w8a8.json +# 
https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_large_4x_checkpoint_float32.pth.tar +QUICKSRNET_WEIGHTS = "quicksrnet_large_4x_checkpoint_float32.pth.tar" +SCALING_FACTOR = 4 +NUM_CHANNELS = 64 +NUM_INTERMEDIATE_LAYERS = 11 +USE_ITO_CONNECTION = True + + +class QuickSRNetLarge(BaseModel): + """Exportable QuickSRNet-Large upscaler, end-to-end.""" + + def __init__( + self, + quicksrnet_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = quicksrnet_model + + @classmethod + def from_pretrained(cls) -> QuickSRNetLarge: + model = _load_quicksrnet_source_model( + MODEL_ID, + MODEL_ASSET_VERSION, + SCALING_FACTOR, + NUM_CHANNELS, + NUM_INTERMEDIATE_LAYERS, + USE_ITO_CONNECTION, + ) + dst = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, QUICKSRNET_WEIGHTS + ).fetch() + checkpoint = torch.load(dst, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + model.eval() + + return cls(model) + + def get_evaluator(self) -> BaseEvaluator: + return SuperResolutionOutputEvaluator() + + def forward(self, image: torch.Tensor) -> torch.Tensor: + """ + Run QuickSRNet-Large on `image`, and produce an upscaled image + + Parameters: + image: Pixel values pre-processed for model consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + image: Pixel values + Range: float[0, 1] + 3-channel Color Space: RGB + """ + + return self.model(image) + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 128, + width: int = 128, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. 
+ return {"image": ((batch_size, num_channels, height, width), "float32")} diff --git a/qai_hub_models/models/quicksrnetlarge/perf.yaml b/qai_hub_models/models/quicksrnetlarge/perf.yaml new file mode 100644 index 00000000..1b0102ea --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: QuickSRNetLarge + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 2532.0 + throughput: 394.9447077409163 + estimated_peak_memory_range: + min: 16384 + max: 8035880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 28 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 31 + job_id: jz57el4rp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2106.0 + throughput: 474.8338081671415 + estimated_peak_memory_range: + min: 212992 + max: 76319976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 32 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 32 + job_id: jqp4yd1lp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:38:01.534196Z' diff --git a/qai_hub_models/models/quicksrnetlarge/test.py b/qai_hub_models/models/quicksrnetlarge/test.py new file mode 100644 index 00000000..cd848301 --- /dev/null +++ 
b/qai_hub_models/models/quicksrnetlarge/test.py @@ -0,0 +1,35 @@ +import numpy as np + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.quicksrnetlarge.demo import IMAGE_ADDRESS +from qai_hub_models.models.quicksrnetlarge.demo import main as demo_main +from qai_hub_models.models.quicksrnetlarge.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetLarge, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_same, skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnetlarge_output.png" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + model = QuickSRNetLarge.from_pretrained() + app = SuperResolutionApp(model=model) + output_img = app.upscale_image(image)[0] + + expected_output_image = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_same( + np.asarray(expected_output_image, dtype=np.float32), + np.array(output_img).astype(np.float32), + diff_tol=0.01, + ) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/quicksrnetmedium/README.md b/qai_hub_models/models/quicksrnetmedium/README.md new file mode 100644 index 00000000..0c2a24e4 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [QuickSRNetMedium: Upscale images and remove image noise](https://aihub.qualcomm.com/models/quicksrnetmedium) + +QuickSRNet Medium is designed for upscaling images on mobile platforms to sharpen in real-time. + +This is based on the implementation of QuickSRNetMedium found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet). 
This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/quicksrnetmedium). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.quicksrnetmedium.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.quicksrnetmedium.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of QuickSRNetMedium can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). 
+ + +## References +* [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet) diff --git a/qai_hub_models/models/quicksrnetmedium/__init__.py b/qai_hub_models/models/quicksrnetmedium/__init__.py new file mode 100644 index 00000000..93bf8ebc --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import QuickSRNetMedium as Model # noqa: F401 diff --git a/qai_hub_models/models/quicksrnetmedium/demo.py b/qai_hub_models/models/quicksrnetmedium/demo.py new file mode 100644 index 00000000..09f0b423 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/demo.py @@ -0,0 +1,25 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.quicksrnetmedium.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetMedium, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnet_demo.jpg" +) + + +# Run QuickSRNet end-to-end on a sample image. +# The demo will display an upscaled image +def main(is_test: bool = False): + super_resolution_demo( + model_cls=QuickSRNetMedium, + default_image=IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetmedium/export.py b/qai_hub_models/models/quicksrnetmedium/export.py new file mode 100644 index 00000000..1923278d --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/export.py @@ -0,0 +1,190 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export QuickSRNetMedium for on-device use through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    When `can_access_qualcomm_ai_hub()` is false, none of the steps above run:
    the call is delegated to `export_without_hub_access` and its result is
    returned instead.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/quicksrnetmedium`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        If AI Hub is not accessible, the return value of
        `export_without_hub_access` (annotated as a list of strings).

    Raises:
        AssertionError: If the profile or inference job finishes unsuccessfully.
            Only checked when the corresponding summary step runs.
    """
    # NOTE(review): the file header marks this module as auto-generated, so
    # the same change should be made in the generator/template as well.
    model_name = "quicksrnetmedium"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "QuickSRNetMedium",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in `.tflite`, even when
        # `dst_runtime` is not TFLITE -- confirm whether the suffix should
        # follow `target_runtime`.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # The blocking wait() must not live inside an `assert`: under
        # `python -O` the whole assert statement (including the wait) is
        # removed. AssertionError is kept so existing callers see the same type.
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Command-line entry point: parse the export arguments and run `export_model`."""
    # Silence all Python warnings for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))
+use_case: Super Resolution +tags: [] +research_paper: https://arxiv.org/abs/2303.04336 +research_paper_title: "QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms" +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet +technical_details: + Number of parameters: 61K + Model size: 266 KB + Model checkpoint: quicksrnet_medium_4x_checkpoint_float32 + Input resolution: 128x128 +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: ['xlsr', 'esrgan', 'quicksrnetlarge'] +has_static_banner: yes +has_animated_banner: yes +license_type: other +dataset: [] diff --git a/qai_hub_models/models/quicksrnetmedium/model.py b/qai_hub_models/models/quicksrnetmedium/model.py new file mode 100644 index 00000000..1dc8c9e4 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/model.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.superres_evaluator import SuperResolutionOutputEvaluator +from qai_hub_models.models._shared.quicksrnet.common import ( + _load_quicksrnet_source_model, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_medium_4x_w8a8.json +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_medium_4x_checkpoint_float32.pth.tar +QUICKSRNET_WEIGHTS = "quicksrnet_medium_4x_checkpoint_float32.pth.tar" +SCALING_FACTOR = 4 
class QuickSRNetMedium(BaseModel):
    """QuickSRNet-Medium image upscaler wrapped as an exportable, end-to-end model."""

    def __init__(
        self,
        quicksrnet_model: torch.nn.Module,
    ) -> None:
        super().__init__()
        # The wrapped network; `forward` simply delegates to it.
        self.model = quicksrnet_model

    @classmethod
    def from_pretrained(cls) -> QuickSRNetMedium:
        """
        Build the source network, load the float32 checkpoint into it and wrap it.

        The architecture is created by `_load_quicksrnet_source_model` from the
        module-level constants; the checkpoint file named by `QUICKSRNET_WEIGHTS`
        is fetched from the asset store and its "state_dict" entry is loaded on CPU.
        The network is put in eval mode before being wrapped.
        """
        network = _load_quicksrnet_source_model(
            MODEL_ID,
            MODEL_ASSET_VERSION,
            SCALING_FACTOR,
            NUM_CHANNELS,
            NUM_INTERMEDIATE_LAYERS,
            USE_ITO_CONNECTION,
        )
        weights_asset = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, QUICKSRNET_WEIGHTS
        )
        checkpoint_file = weights_asset.fetch()
        cpu = torch.device("cpu")
        loaded = torch.load(checkpoint_file, map_location=cpu)
        network.load_state_dict(loaded["state_dict"])
        network.eval()

        return cls(network)

    def get_evaluator(self) -> BaseEvaluator:
        """Return a new `SuperResolutionOutputEvaluator`."""
        evaluator = SuperResolutionOutputEvaluator()
        return evaluator

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """
        Upscale `image` with QuickSRNet-Medium.

        Parameters:
            image: Pixel values pre-processed for model consumption.
                   Range: float[0, 1]
                   3-channel Color Space: RGB

        Returns:
            image: Pixel values
                   Range: float[0, 1]
                   3-channel Color Space: RGB
        """
        upscaled = self.model(image)
        return upscaled

    def get_input_spec(
        self,
        batch_size: int = 1,
        num_channels: int = 3,
        height: int = 128,
        width: int = 128,
    ) -> InputSpec:
        """
        Describe the model input as (name -> (shape, type)) pairs.

        The result can be used with the qai_hub python API to declare the
        model input specification upon submitting a profile job.
        """
        image_shape = (batch_size, num_channels, height, width)
        return {"image": (image_shape, "float32")}
+ - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 Pro + - Xiaomi 13 + - Xiaomi 13 Pro + supported_oses: + - Android + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-08T22:33:19.043922Z' + torchscript_onnx_tflite: + inference_time: 1407.0 + throughput: 710.7320540156361 + estimated_peak_memory_range: + min: 32768 + max: 8364248 + precision: fp16 + primary_compute_unit: NPU + job_status: Passed + torchscript_onnx_qnn: + inference_time: 992.0 + throughput: 1008.0645161290323 + estimated_peak_memory_range: + min: 217088 + max: 28908792 + precision: fp16 + primary_compute_unit: NPU + job_status: Passed + torchscript_onnx_ort_qnn_htp: + inference_time: 17078.0 + throughput: 58.55486590935707 + estimated_peak_memory_range: + min: 15241216 + max: 26970304 + precision: fp32 + primary_compute_unit: CPU + job_status: Passed + torchscript_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0.0 + max: 0.0 + precision: 'null' + primary_compute_unit: 'null' + job_status: 'null' diff --git a/qai_hub_models/models/quicksrnetmedium/test.py b/qai_hub_models/models/quicksrnetmedium/test.py new file mode 100644 index 00000000..ebf24cd7 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/test.py @@ -0,0 +1,35 @@ +import numpy as np + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.quicksrnetmedium.demo import IMAGE_ADDRESS +from qai_hub_models.models.quicksrnetmedium.demo import main as demo_main +from qai_hub_models.models.quicksrnetmedium.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetMedium, +) +from 
@skip_clone_repo_check
def test_task():
    """Upscale the demo image and compare the result with the stored reference output."""
    input_image = load_image(IMAGE_ADDRESS)
    upscaler = SuperResolutionApp(model=QuickSRNetMedium.from_pretrained())
    upscaled = upscaler.upscale_image(input_image)[0]

    reference = load_image(OUTPUT_IMAGE_ADDRESS)
    expected_pixels = np.asarray(reference, dtype=np.float32)
    actual_pixels = np.array(upscaled).astype(np.float32)
    assert_most_same(expected_pixels, actual_pixels, diff_tol=0.01)


def test_demo():
    """Run the demo entry point with `is_test=True`."""
    demo_main(is_test=True)
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[real_esrgan_general_x4v3]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.real_esrgan_general_x4v3.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.real_esrgan_general_x4v3.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Real-ESRGAN-General-x4v3 can be found + [here](https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE). 
+ + +## References +* [Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data](https://arxiv.org/abs/2107.10833) +* [Source Model Implementation](https://github.com/xinntao/Real-ESRGAN/tree/master) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/__init__.py b/qai_hub_models/models/real_esrgan_general_x4v3/__init__.py new file mode 100644 index 00000000..eeeb6e70 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_general_x4v3/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import Real_ESRGAN_General_x4v3 as Model # noqa: F401 diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/demo.py b/qai_hub_models/models/real_esrgan_general_x4v3/demo.py new file mode 100644 index 00000000..b79dfbfc --- /dev/null +++ b/qai_hub_models/models/real_esrgan_general_x4v3/demo.py @@ -0,0 +1,26 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.real_esrgan_general_x4v3.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + Real_ESRGAN_General_x4v3, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +WEIGHTS_HELP_MSG = "RealESRGAN checkpoint `.pth` name from the Real-ESRGAN repo. Can be set to any of the model names defined here: https://github.com/xinntao/Real-ESRGAN/blob/master/docs/model_zoo.md to automatically download the file instead." +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "real_esrgan_general_x4v3_demo.jpg" +) + + +# Run Real-ESRGAN end-to-end on a sample image. +# The demo will display the upscaled image.
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export Real-ESRGAN-General-x4v3 for on-device use through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    When `can_access_qualcomm_ai_hub()` is false, none of the steps above run:
    the call is delegated to `export_without_hub_access` and its result is
    returned instead.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/real_esrgan_general_x4v3`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        If AI Hub is not accessible, the return value of
        `export_without_hub_access` (annotated as a list of strings).

    Raises:
        AssertionError: If the profile or inference job finishes unsuccessfully.
            Only checked when the corresponding summary step runs.
    """
    # NOTE(review): the file header marks this module as auto-generated, so
    # the same change should be made in the generator/template as well.
    model_name = "real_esrgan_general_x4v3"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "Real-ESRGAN-General-x4v3",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in `.tflite`, even when
        # `dst_runtime` is not TFLITE -- confirm whether the suffix should
        # follow `target_runtime`.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # The blocking wait() must not live inside an `assert`: under
        # `python -O` the whole assert statement (including the wait) is
        # removed. AssertionError is kept so existing callers see the same type.
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)


def main():
    """Command-line entry point: parse the export arguments and run `export_model`."""
    # Silence all Python warnings for the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))
class Real_ESRGAN_General_x4v3(BaseModel):
    """Exportable RealESRGAN upscaler, end-to-end."""

    def __init__(
        self,
        realesrgan_model: torch.nn.Module,
    ) -> None:
        super().__init__()
        # The wrapped network; `forward` simply delegates to it.
        self.model = realesrgan_model

    @classmethod
    def from_pretrained(
        cls,
        weight_path: str = DEFAULT_WEIGHTS,
    ) -> Real_ESRGAN_General_x4v3:
        """
        Load Real_ESRGAN_General_x4v3 from a weightfile created by the source RealESRGAN repository.

        Parameters:
            weight_path: Either a path to an existing `.pth` weights file, or a
                known weights name (currently only "realesr-general-x4v3"),
                which is downloaded on first use.

        Raises:
            ValueError: If `weight_path` is neither an existing file nor a known
                weights name.
        """
        # Load PyTorch model from disk
        realesrgan_model = _load_realesrgan_source_model_from_weights(weight_path)

        # Instantiate through `cls` so subclasses get an instance of their own type.
        return cls(realesrgan_model)

    def get_evaluator(self) -> BaseEvaluator:
        """Return a new `SuperResolutionOutputEvaluator`."""
        return SuperResolutionOutputEvaluator()

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """
        Run RealESRGAN on `image`, and produce an upscaled image

        Parameters:
            image: Pixel values pre-processed for GAN consumption.
                   Range: float[0, 1]
                   3-channel Color Space: RGB

        Returns:
            image: Pixel values
                   Range: float[0, 1]
                   3-channel Color Space: RGB
        """
        return self.model(image)

    @staticmethod
    def get_input_spec(
        batch_size: int = 1,
        num_channels: int = 3,
        height: int = 128,
        width: int = 128,
    ) -> InputSpec:
        """
        Describe the model input as (name -> (shape, type)) pairs.

        This can be used with the qai_hub python API to declare
        the model input specification upon submitting a profile job.
        """
        return {"image": ((batch_size, num_channels, height, width), "float32")}


def _get_weightsfile_from_name(weights_name: str = DEFAULT_WEIGHTS):
    """
    Convert from names of weights files to the url for the weights file.

    Returns an empty string when `weights_name` is not a known weights name.
    """
    if weights_name == DEFAULT_WEIGHTS:
        return "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth"
    return ""


def _load_realesrgan_source_model_from_weights(
    weights_name_or_path: str,
) -> torch.nn.Module:
    """
    Build `SRVGGNetCompact` from the Real-ESRGAN source repo and load weights into it.

    Parameters:
        weights_name_or_path: Path to an existing weights file, or a known
            weights name whose `.pth` file is downloaded into the current
            working directory if it is not already there.

    Returns:
        The network with the "params_ema" weights loaded if the checkpoint has
        that key, otherwise the "params" weights.

    Raises:
        ValueError: If the argument is neither an existing file nor a known
            weights name.
        requests.HTTPError: If the weights download returns an error status.
    """
    with SourceAsRoot(
        REALESRGAN_SOURCE_REPOSITORY,
        REALESRGAN_SOURCE_REPO_COMMIT,
        MODEL_ID,
        MODEL_ASSET_VERSION,
    ):
        # Patch path for this load only, since the model source
        # code references modules via a global scope.
        # CWD should be the repository path now
        realesrgan_repo_path = os.getcwd()
        # The official repo omits this folder, which causes import issues
        version_dir = os.path.join(realesrgan_repo_path, "realesrgan/version")
        os.makedirs(version_dir, exist_ok=True)

        expanded_path = os.path.expanduser(weights_name_or_path)
        if os.path.exists(expanded_path):
            weights_path = expanded_path
        else:
            weights_path = os.path.join(
                realesrgan_repo_path, weights_name_or_path + ".pth"
            )
            if not os.path.exists(weights_path):
                weights_url = _get_weightsfile_from_name(weights_name_or_path)
                if not weights_url:
                    # Fail early with a readable message; the previous code passed
                    # the empty URL to requests, which raised an opaque schema error.
                    raise ValueError(
                        f"'{weights_name_or_path}' is neither an existing weights "
                        f"file nor a known weights name (known: '{DEFAULT_WEIGHTS}')."
                    )

                # download the weights file
                import requests

                response = requests.get(weights_url, timeout=60)
                # Never store an HTTP error page as a `.pth` file: it would be
                # picked up as "already downloaded" on every later run.
                response.raise_for_status()
                # Write to a temporary name and rename, so an interrupted write
                # cannot leave a truncated weights file behind.
                partial_path = weights_path + ".part"
                with open(partial_path, "wb") as file:
                    file.write(response.content)
                os.replace(partial_path, weights_path)
                print(f"Weights file downloaded as {weights_path}")

        # necessary import. `archs` comes from the realesrgan repo.
        from realesrgan.archs.srvgg_arch import SRVGGNetCompact

        realesrgan_model = SRVGGNetCompact(
            num_in_ch=3,
            num_out_ch=3,
            num_feat=64,
            num_conv=32,
            upscale=SCALING_FACTOR,
            act_type="prelu",
        )
        # NOTE(review): torch.load unpickles the file, and `weights_name_or_path`
        # may point at an arbitrary user-supplied checkpoint -- only load
        # trusted files.
        pretrained_dict = torch.load(weights_path, map_location=torch.device("cpu"))

        # Prefer the "params_ema" weights when the checkpoint provides them.
        keyname = "params_ema" if "params_ema" in pretrained_dict else "params"
        realesrgan_model.load_state_dict(pretrained_dict[keyname], strict=True)

        return realesrgan_model
- Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Real-ESRGAN-General-x4v3 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 7168.0 + throughput: 139.50892857142858 + estimated_peak_memory_range: + min: 15761408 + max: 27106520 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 69 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 72 + job_id: jmg9zy3qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6995.0 + throughput: 142.9592566118656 + estimated_peak_memory_range: + min: 45056 + max: 67127640 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 73 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 73 + job_id: jnp1nwdkg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:15:20.798589Z' diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt b/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt new file mode 100644 index 00000000..80ca5630 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt @@ -0,0 +1,6 @@ +opencv-python +PyYAML +requests +scipy +seaborn +basicsr diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/test.py b/qai_hub_models/models/real_esrgan_general_x4v3/test.py new file mode 100644 index 00000000..0b11d527 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_general_x4v3/test.py @@ -0,0 +1,35 @@ +import numpy as np + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.real_esrgan_general_x4v3.demo 
import IMAGE_ADDRESS +from qai_hub_models.models.real_esrgan_general_x4v3.demo import main as demo_main +from qai_hub_models.models.real_esrgan_general_x4v3.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + Real_ESRGAN_General_x4v3, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "real_esrgan_general_x4v3_demo.png" +) + + +@skip_clone_repo_check +def test_realesrgan_app(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + model = Real_ESRGAN_General_x4v3.from_pretrained() + app = SuperResolutionApp(model) + app_output_image = app.upscale_image(image)[0] + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32), + np.asarray(output_image, dtype=np.float32), + rtol=0.02, + atol=1.5, + ) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/real_esrgan_x4plus/README.md b/qai_hub_models/models/real_esrgan_x4plus/README.md new file mode 100644 index 00000000..b4bb3d0d --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Real-ESRGAN-x4plus: Upscale images and remove image noise](https://aihub.qualcomm.com/models/real_esrgan_x4plus) + +Real-ESRGAN is a machine learning model that upscales an image with minimal loss in quality. The implementation is a derivative of the Real-ESRGAN-x4plus architecture, a larger and more powerful version compared to the Real-ESRGAN-general-x4v3 architecture. + +This is based on the implementation of Real-ESRGAN-x4plus found +[here](https://github.com/xinntao/Real-ESRGAN). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/real_esrgan_x4plus). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[real_esrgan_x4plus]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.real_esrgan_x4plus.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.real_esrgan_x4plus.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Real-ESRGAN-x4plus can be found + [here](https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE). 
+ + +## References +* [Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data](https://arxiv.org/abs/2107.10833) +* [Source Model Implementation](https://github.com/xinntao/Real-ESRGAN) diff --git a/qai_hub_models/models/real_esrgan_x4plus/__init__.py b/qai_hub_models/models/real_esrgan_x4plus/__init__.py new file mode 100644 index 00000000..a4ce93da --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import Real_ESRGAN_x4plus as Model # noqa: F401 diff --git a/qai_hub_models/models/real_esrgan_x4plus/demo.py b/qai_hub_models/models/real_esrgan_x4plus/demo.py new file mode 100644 index 00000000..65a023f1 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/demo.py @@ -0,0 +1,26 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.real_esrgan_x4plus.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + Real_ESRGAN_x4plus, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "real_esrgan_x4plus_demo.jpg" +) +WEIGHTS_HELP_MSG = "RealESRGAN checkpoint `.pth` name from the Real-ESRGAN repo. Can be set to any of the model names defined here: https://github.com/xinntao/Real-ESRGAN/blob/master/docs/model_zoo.md to automatically download the file instead." + + +# Run Real-ESRGAN end-to-end on a sample image. +# The work is delegated to the shared super_resolution_demo helper. 
+def main(is_test: bool = False): + super_resolution_demo( + model_cls=Real_ESRGAN_x4plus, + default_image=IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/real_esrgan_x4plus/export.py b/qai_hub_models/models/real_esrgan_x4plus/export.py new file mode 100644 index 00000000..9c54437f --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/export.py @@ -0,0 +1,177 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.real_esrgan_x4plus import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. 
+ 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "real_esrgan_x4plus" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "real_esrgan_x4plus", + "Real-ESRGAN-x4plus", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/real_esrgan_x4plus/info.yaml b/qai_hub_models/models/real_esrgan_x4plus/info.yaml new file mode 100644 index 00000000..3b9bc903 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/info.yaml @@ -0,0 +1,30 @@ +name: Real-ESRGAN-x4plus +# id must match with the model dir name in qai_hub_models +id: real_esrgan_x4plus +status: public +headline: Upscale images and remove image noise. +domain: Computer Vision +description: Real-ESRGAN is a machine learning model that upscales an image with minimal loss in quality. The implementation is a derivative of the Real-ESRGAN-x4plus architecture, a larger and more powerful version compared to the Real-ESRGAN-general-x4v3 architecture. 
+use_case: Super Resolution +tags: [] +research_paper: https://arxiv.org/abs/2107.10833 +research_paper_title: "Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data" +license: https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE +source_repo: https://github.com/xinntao/Real-ESRGAN +technical_details: + Number of parameters: 16.7M + Model size: 67.1 MB + Model checkpoint: RealESRGAN_x4plus + Input resolution: 128x128 +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: ['esrgan', 'real_esrgan_general_x4v3'] +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: [] diff --git a/qai_hub_models/models/real_esrgan_x4plus/model.py b/qai_hub_models/models/real_esrgan_x4plus/model.py new file mode 100644 index 00000000..40383cdc --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/model.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.superres_evaluator import SuperResolutionOutputEvaluator +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + SourceAsRoot, + load_torch, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +# The architecture for this RealESRGAN model comes from the original ESRGAN repo +REALESRGAN_SOURCE_REPOSITORY = "https://github.com/xinntao/ESRGAN" +REALESRGAN_SOURCE_REPO_COMMIT = "73e9b634cf987f5996ac2dd33f4050922398a921" +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 3 +DEFAULT_WEIGHTS = "RealESRGAN_x4plus" +DEFAULT_WEIGHTS_URL = CachedWebModelAsset( + "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth", + MODEL_ID, + MODEL_ASSET_VERSION, + "RealESRGAN_x4plus.pth", +) +PRE_PAD = 10 +SCALING_FACTOR = 4 + + +class Real_ESRGAN_x4plus(BaseModel): + 
"""Exportable RealESRGAN upscaler, end-to-end.""" + + def __init__( + self, + realesrgan_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = realesrgan_model + + @classmethod + def from_pretrained( + cls, + weight_path: str = DEFAULT_WEIGHTS, + ) -> Real_ESRGAN_x4plus: + """Load RealESRGAN from a weightfile created by the source RealESRGAN repository.""" + + # Load PyTorch model from disk + realesrgan_model = _load_realesrgan_source_model_from_weights( + weight_path + ).eval() + + return cls(realesrgan_model) + + def get_evaluator(self) -> BaseEvaluator: + return SuperResolutionOutputEvaluator() + + def forward(self, image: torch.Tensor) -> torch.Tensor: + """ + Run RealESRGAN on `image`, and produce an upscaled image + + Parameters: + image: Pixel values pre-processed for GAN consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + image: Pixel values + Range: float[0, 1] + 3-channel Color Space: RGB + """ + + with torch.no_grad(): + # upscale + output = self.model(image) + + output_img = output.squeeze().float().cpu().clamp_(0, 1) + + return output_img + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 128, + width: int = 128, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def _get_weightsfile_from_name(weights_name: str = DEFAULT_WEIGHTS): + """Convert from names of weights files to the url for the weights file""" + if weights_name == DEFAULT_WEIGHTS: + return DEFAULT_WEIGHTS_URL + return "" + + +def _load_realesrgan_source_model_from_weights(weights_name: str) -> torch.nn.Module: + # Load RealESRGAN model from the source repository using the given weights. 
+ # Returns .realesrgan.archs.srvgg_arch + weights_url = _get_weightsfile_from_name(weights_name) + + with SourceAsRoot( + REALESRGAN_SOURCE_REPOSITORY, + REALESRGAN_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + # necessary import. `archs` comes from the realesrgan repo. + from basicsr.archs.rrdbnet_arch import RRDBNet + + realesrgan_model = RRDBNet( + num_in_ch=3, + num_out_ch=3, + num_feat=64, + num_block=23, + num_grow_ch=32, + scale=SCALING_FACTOR, + ) + pretrained_dict = load_torch(weights_url) + + if "params_ema" in pretrained_dict: + keyname = "params_ema" + else: + keyname = "params" + realesrgan_model.load_state_dict(pretrained_dict[keyname], strict=True) + + return realesrgan_model diff --git a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml new file mode 100644 index 00000000..76ef0ef8 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Real-ESRGAN-x4plus + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + torchscript_onnx_qnn: + inference_time: 67244.0 + throughput: 14.87121527571233 + 
estimated_peak_memory_range: + min: 102400 + max: 106071688 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1031 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1031 + job_id: jygzljxz5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:06:00.248808Z' diff --git a/qai_hub_models/models/real_esrgan_x4plus/requirements.txt b/qai_hub_models/models/real_esrgan_x4plus/requirements.txt new file mode 100644 index 00000000..6292b978 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/requirements.txt @@ -0,0 +1,4 @@ +opencv-python +scipy +seaborn +basicsr diff --git a/qai_hub_models/models/real_esrgan_x4plus/test.py b/qai_hub_models/models/real_esrgan_x4plus/test.py new file mode 100644 index 00000000..cfa66ca2 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/test.py @@ -0,0 +1,35 @@ +import numpy as np + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.real_esrgan_x4plus.demo import IMAGE_ADDRESS +from qai_hub_models.models.real_esrgan_x4plus.demo import main as demo_main +from qai_hub_models.models.real_esrgan_x4plus.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + Real_ESRGAN_x4plus, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_same, skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "real_esrgan_x4plus_demo_output.png" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + model = Real_ESRGAN_x4plus.from_pretrained() + app = SuperResolutionApp(model=model) + output_img = app.upscale_image(image)[0] + + expected_output_image = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_same( + 
np.asarray(expected_output_image, dtype=np.float32), + np.array(output_img).astype(np.float32), + diff_tol=0.01, + ) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/regnet/README.md b/qai_hub_models/models/regnet/README.md new file mode 100644 index 00000000..2a1a1082 --- /dev/null +++ b/qai_hub_models/models/regnet/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [RegNet: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/regnet) + +RegNet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of RegNet found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/regnet). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.regnet.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.regnet.export +``` +Additional options are documented with the `--help` option. 
Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of RegNet can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py) diff --git a/qai_hub_models/models/regnet/__init__.py b/qai_hub_models/models/regnet/__init__.py new file mode 100644 index 00000000..4a181edb --- /dev/null +++ b/qai_hub_models/models/regnet/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import RegNet as Model # noqa: F401 diff --git a/qai_hub_models/models/regnet/demo.py b/qai_hub_models/models/regnet/demo.py new file mode 100644 index 00000000..677d4795 --- /dev/null +++ b/qai_hub_models/models/regnet/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.regnet.model import RegNet + + +def main(is_test: bool = False): + imagenet_demo(RegNet, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/regnet/export.py b/qai_hub_models/models/regnet/export.py new file mode 100644 index 00000000..3b4fe268 --- /dev/null +++ b/qai_hub_models/models/regnet/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.regnet import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "regnet" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "regnet", + "RegNet", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/regnet/info.yaml b/qai_hub_models/models/regnet/info.yaml new file mode 100644 index 00000000..ad3edc13 --- /dev/null +++ b/qai_hub_models/models/regnet/info.yaml @@ -0,0 +1,38 @@ +name: RegNet +# id must match with the model dir name in qai_hub_models +id: regnet +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: RegNet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone +research_paper: https://arxiv.org/abs/2003.13678 +research_paper_title: Designing Network Design Spaces +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py +technical_details: + Number of parameters: 15.3M + Model size: 59.2 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/regnet/model.py b/qai_hub_models/models/regnet/model.py new file mode 100644 index 00000000..2066a62f --- /dev/null +++ b/qai_hub_models/models/regnet/model.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" +MODEL_ASSET_VERSION = 3 + + +class RegNet(ImagenetClassifier): + model_builder = tv_models.regnet_x_3_2gf + DEFAULT_WEIGHTS = DEFAULT_WEIGHTS diff --git a/qai_hub_models/models/regnet/perf.yaml b/qai_hub_models/models/regnet/perf.yaml new file mode 100644 index 00000000..896bd9cb --- /dev/null +++ b/qai_hub_models/models/regnet/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + 
- Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: RegNet + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1921.0 + throughput: 520.5622071837585 + estimated_peak_memory_range: + min: 16384 + max: 1931624 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 112 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 112 + job_id: jogk2q8og + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1659.0 + throughput: 602.7727546714889 + estimated_peak_memory_range: + min: 237568 + max: 59498896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 187 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 187 + job_id: jn5qlrvmp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:36:39.546315Z' diff --git a/qai_hub_models/models/regnet/test.py b/qai_hub_models/models/regnet/test.py new file mode 100644 index 00000000..73bd8903 --- /dev/null +++ b/qai_hub_models/models/regnet/test.py @@ -0,0 +1,26 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.regnet.demo import main as demo_main +from qai_hub_models.models.regnet.model import MODEL_ASSET_VERSION, MODEL_ID, RegNet + + +def test_task(): + run_imagenet_classifier_test( + RegNet.from_pretrained(), + MODEL_ID, + probability_threshold=0.45, + atol=0.2, + rtol=0.2, + asset_version=MODEL_ASSET_VERSION, + ) + + +def test_trace(): + run_imagenet_classifier_trace_test(RegNet.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/resnet101/README.md 
b/qai_hub_models/models/resnet101/README.md new file mode 100644 index 00000000..f4701eef --- /dev/null +++ b/qai_hub_models/models/resnet101/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNet101: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnet101) + +ResNet101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNet101 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/resnet101). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnet101.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnet101.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. 
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ResNet101 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/resnet101/__init__.py b/qai_hub_models/models/resnet101/__init__.py new file mode 100644 index 00000000..9134e438 --- /dev/null +++ b/qai_hub_models/models/resnet101/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNet101 as Model # noqa: F401 diff --git a/qai_hub_models/models/resnet101/demo.py b/qai_hub_models/models/resnet101/demo.py new file mode 100644 index 00000000..361c0e31 --- /dev/null +++ b/qai_hub_models/models/resnet101/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnet101.model import ResNet101 + + +def main(is_test: bool = False): + imagenet_demo(ResNet101, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnet101/export.py b/qai_hub_models/models/resnet101/export.py new file mode 100644 index 00000000..83d5f53a --- /dev/null +++ b/qai_hub_models/models/resnet101/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.resnet101 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnet101" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnet101", + "ResNet101", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnet101/info.yaml b/qai_hub_models/models/resnet101/info.yaml new file mode 100644 index 00000000..c5afee8e --- /dev/null +++ b/qai_hub_models/models/resnet101/info.yaml @@ -0,0 +1,38 @@ +name: ResNet101 +# id must match with the model dir name in qai_hub_models +id: resnet101 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +use_case: Image Classification +description: ResNet101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+tags: + - backbone +research_paper: https://arxiv.org/abs/1512.03385 +research_paper_title: Deep Residual Learning for Image Recognition +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 44.5M + Model size: 171 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnet101/model.py b/qai_hub_models/models/resnet101/model.py new file mode 100644 index 00000000..69c5273d --- /dev/null +++ b/qai_hub_models/models/resnet101/model.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class ResNet101(ImagenetClassifier): + model_builder = tv_models.resnet101 + DEFAULT_WEIGHTS = DEFAULT_WEIGHTS diff --git a/qai_hub_models/models/resnet101/perf.yaml b/qai_hub_models/models/resnet101/perf.yaml new file mode 100644 index 00000000..3baa5107 --- /dev/null +++ b/qai_hub_models/models/resnet101/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - 
Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ResNet101 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 3008.0 + throughput: 332.4468085106383 + estimated_peak_memory_range: + min: 28672 + max: 1505496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 145 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 145 + job_id: jnp1nw6lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2895.0 + throughput: 345.4231433506045 + estimated_peak_memory_range: + min: 622592 + max: 226606408 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 244 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 244 + job_id: jvgddq2lg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:20:33.212112Z' diff --git a/qai_hub_models/models/resnet101/test.py b/qai_hub_models/models/resnet101/test.py new file mode 100644 index 00000000..94f4f5ec --- /dev/null +++ b/qai_hub_models/models/resnet101/test.py @@ -0,0 +1,26 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.resnet101.demo import main as demo_main +from qai_hub_models.models.resnet101.model import MODEL_ID, ResNet101 + + +def test_task(): + run_imagenet_classifier_test( + ResNet101.from_pretrained(), + MODEL_ID, + probability_threshold=0.45, + diff_tol=0.005, + rtol=0.02, + atol=0.02, + ) + + +def test_trace(): + run_imagenet_classifier_trace_test(ResNet101.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/resnet101_quantized/README.md b/qai_hub_models/models/resnet101_quantized/README.md new 
file mode 100644 index 00000000..4f165dfb --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNet101Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnet101_quantized) + +ResNet101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNet101Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/resnet101_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnet101_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnet101_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. 
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ResNet101Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/resnet101_quantized/__init__.py b/qai_hub_models/models/resnet101_quantized/__init__.py new file mode 100644 index 00000000..229e11f6 --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNet101Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/resnet101_quantized/demo.py b/qai_hub_models/models/resnet101_quantized/demo.py new file mode 100644 index 00000000..1efb0bda --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnet101_quantized.model import ResNet101Quantizable + + +def main(is_test: bool = False): + imagenet_demo(ResNet101Quantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnet101_quantized/export.py b/qai_hub_models/models/resnet101_quantized/export.py new file mode 100644 index 00000000..49f3023f --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/export.py @@ -0,0 +1,195 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.resnet101_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnet101_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnet101_quantized", + "ResNet101Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnet101_quantized/info.yaml b/qai_hub_models/models/resnet101_quantized/info.yaml new file mode 100644 index 00000000..716af3e8 --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/info.yaml @@ -0,0 +1,39 @@ +name: ResNet101Quantized +# id must match with the model dir name in qai_hub_models +id: resnet101_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +use_case: Image Classification +description: ResNet101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+tags: + - backbone + - quantized +research_paper: https://arxiv.org/abs/1512.03385 +research_paper_title: Deep Residual Learning for Image Recognition +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 44.5M + Model size: 171 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnet101_quantized/model.py b/qai_hub_models/models/resnet101_quantized/model.py new file mode 100644 index 00000000..81f8696e --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/model.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import ( + equalize_bn_folded_model, + fold_all_batch_norms, +) +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.resnet101.model import ResNet101 +from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 3 +DEFAULT_ENCODINGS = "resnet101_quantized_encodings.json" + + +class ResNet101Quantizable(AIMETQuantizableMixin, ResNet101): + """ResNet101 with post train quantization support. + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. 
+ Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + ResNet101.__init__(self, sim_model.model) + AIMETQuantizableMixin.__init__( + self, sim_model, needs_onnx_direct_aimet_export=False + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "ResNet101Quantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + model = ResNet101.from_pretrained() + input_shape = model.get_input_spec()["image_tensor"][0] + dummy_input = torch.rand(input_shape) + pairs = fold_all_batch_norms(model, input_shape, dummy_input) + equalize_bn_folded_model(model, input_shape, pairs, dummy_input) + + sim = QuantizationSimModel( + model.net, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_per_channel_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/resnet101_quantized/perf.yaml b/qai_hub_models/models/resnet101_quantized/perf.yaml new file mode 100644 index 00000000..b8f30516 --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 
def test_task():
    """Run the shared ImageNet classifier check on the quantized ResNet101."""
    quantized_model = ResNet101Quantizable.from_pretrained()
    tolerances = dict(diff_tol=0.005, rtol=0.02, atol=0.2)
    run_imagenet_classifier_test(
        quantized_model,
        MODEL_ID,
        probability_threshold=0.45,
        asset_version=MODEL_ASSET_VERSION,
        **tolerances,
    )


def test_trace():
    """Run the shared trace check on the quantized ResNet101."""
    quantized_model = ResNet101Quantizable.from_pretrained()
    tolerances = dict(diff_tol=0.005, rtol=0.02, atol=0.2)
    run_imagenet_classifier_trace_test(
        quantized_model,
        is_quantized=True,
        **tolerances,
    )


def test_demo():
    """Smoke test: the demo entry point must finish without raising."""
    demo_main(is_test=True)
def main(is_test: bool = False) -> None:
    """Run the shared ImageNet classifier demo with the ResNet18 model class.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo`. The model's test
            suite calls `main(is_test=True)` to check the demo does not crash.
    """
    imagenet_demo(ResNet18, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/resnet18`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
            ` --force_channel_last_input image_tensor` is always appended.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as the inference job's options.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead and no jobs are submitted.
    """
    model_name = "resnet18"
    # `/` binds tighter than `or`: an explicit output_dir wins, otherwise
    # fall back to <cwd>/build/<model_name>.
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        # Without hub access nothing below can run; delegate entirely.
        return export_without_hub_access(
            "resnet18",
            "ResNet18",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", whatever
        # dst_runtime was requested - confirm this is intended for non-TFLite runtimes.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # NOTE(review): under `python -O` this assert is stripped, so the
        # explicit wait() on the job is skipped as well.
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # `sample_inputs` is bound in step 4, which ran under the same
        # `not skip_inferencing` condition.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    """CLI entry point: parse export arguments and forward them to `export_model`."""
    # Silence all warnings for the duration of the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))
class ResNet18(ImagenetClassifier):
    """ImageNet classifier backed by torchvision's `resnet18` builder."""

    # torchvision constructor for the network; presumably consumed by
    # ImagenetClassifier when building the model - verify in the base class.
    model_builder = tv_models.resnet18
    # Re-exports the module-level DEFAULT_WEIGHTS ("IMAGENET1K_V1") on the class.
    DEFAULT_WEIGHTS = DEFAULT_WEIGHTS
def test_task():
    """Run the shared ImageNet classifier check on the float ResNet18."""
    classifier = ResNet18.from_pretrained()
    # NOTE(review): the quantized ResNet suites in this package pass
    # rtol=0.02 / atol=0.2; here the two values are the other way round.
    # Kept as-is - confirm whether the swap is intentional.
    tolerances = dict(diff_tol=0.005, atol=0.02, rtol=0.2)
    run_imagenet_classifier_test(
        classifier,
        MODEL_ID,
        probability_threshold=0.45,
        **tolerances,
    )


def test_trace():
    """Run the shared trace check on the float ResNet18 with default tolerances."""
    classifier = ResNet18.from_pretrained()
    run_imagenet_classifier_trace_test(classifier)


def test_demo():
    """Smoke test: the demo entry point must finish without raising."""
    demo_main(is_test=True)
00000000..5b2a2518 --- /dev/null +++ b/qai_hub_models/models/resnet18_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNet18Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnet18_quantized) + +ResNet18 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNet18Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/resnet18_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnet18_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnet18_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. 
def main(is_test: bool = False) -> None:
    """Run the shared ImageNet classifier demo with the quantizable ResNet18 class.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo`. The model's test
            suite calls `main(is_test=True)` to check the demo does not crash.
    """
    imagenet_demo(ResNet18Quantizable, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a hub source model.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/resnet18_quantized`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
            ` --force_channel_last_input image_tensor` is always appended.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as the inference job's options.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead and no jobs are submitted.
    """
    model_name = "resnet18_quantized"
    # `/` binds tighter than `or`: an explicit output_dir wins, otherwise
    # fall back to <cwd>/build/<model_name>.
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        # Without hub access nothing below can run; delegate entirely.
        return export_without_hub_access(
            "resnet18_quantized",
            "ResNet18Quantized",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Convert the model into the source format submitted to hub
    source_model = model.convert_to_hub_source_model(
        target_runtime, output_path, input_spec
    )
    # Calibration data is only requested for non-TFLite targets.
    if target_runtime == TargetRuntime.TFLITE:
        quant_calibration_data = None
    else:
        quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        calibration_data=quant_calibration_data,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last. This runs on the
        # original inputs so the "image_tensor" entry is looked up by the
        # name the model itself uses.
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        if target_runtime == TargetRuntime.QNN:
            # Adapt the already-transposed inputs for QNN. The adaptation
            # must come last: applying it first and then transposing
            # `sample_inputs` again would throw its result away.
            # NOTE(review): assumes get_qnn_inputs accepts channel-last
            # arrays - confirm against qnn_helpers.
            hub_inputs = get_qnn_inputs(compile_job, hub_inputs)
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the file name always ends in ".tflite", whatever
        # dst_runtime was requested - confirm this is intended for non-TFLite runtimes.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # `sample_inputs` is bound in step 4, which ran under the same
        # `not skip_inferencing` condition. The torch reference uses the
        # untransposed inputs.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    """CLI entry point: parse export arguments and forward them to `export_model`."""
    # Silence all warnings for the duration of the CLI run.
    warnings.filterwarnings("ignore")
    # The parser is built with QNN support switched off for this model.
    parser = export_parser(model_cls=Model, supports_qnn=False)
    args = parser.parse_args()
    export_model(**vars(args))
class ResNet18Quantizable(AIMETQuantizableMixin, ResNet18):
    """ResNet18 wrapped in an AIMET quantization simulation.

    Only 8-bit weights and activations are supported, and only pre-quantized
    checkpoints can be loaded. Quantizing with your own weights and data is
    planned for a later date.
    """

    def __init__(
        self,
        resnet18_model: QuantizationSimModel,
    ) -> None:
        # Both bases are initialised explicitly: ResNet18 receives the torch
        # module held by the sim, the mixin receives the sim itself.
        ResNet18.__init__(self, resnet18_model.model)
        AIMETQuantizableMixin.__init__(
            self,
            resnet18_model,
            needs_onnx_direct_aimet_export=False,
        )

    @classmethod
    def from_pretrained(
        cls,
        aimet_encodings: str | None = "DEFAULT",
    ) -> "ResNet18Quantizable":
        """
        Build the quantization-simulated model from the pretrained ResNet18.

        Parameters:
            aimet_encodings:
                "DEFAULT" loads the published encodings (calibrated on imagenette)
                from the asset store.
                None (or any falsy value) loads no encodings; used when computing them.
                Any other value is treated as a filepath to an encodings file.
        """
        float_model = ResNet18.from_pretrained()
        image_shape = float_model.get_input_spec()["image_tensor"][0]

        # Cross-layer equalization runs before the quantization sim is built.
        equalize_model(float_model, image_shape)

        sim_kwargs = dict(
            quant_scheme="tf_enhanced",
            default_param_bw=8,
            default_output_bw=8,
            config_file=get_per_channel_aimet_config(),
            dummy_input=torch.rand(image_shape),
        )
        quant_sim = QuantizationSimModel(float_model.net, **sim_kwargs)

        if aimet_encodings:
            encodings_path = aimet_encodings
            if encodings_path == "DEFAULT":
                default_asset = CachedWebModelAsset.from_asset_store(
                    MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS
                )
                encodings_path = default_asset.fetch()
            load_encodings_to_sim(quant_sim, encodings_path)

        quant_sim.model.eval()
        return cls(quant_sim)
def test_task():
    """Run the shared ImageNet classifier check on the quantized ResNet18."""
    quantized_model = ResNet18Quantizable.from_pretrained()
    tolerances = dict(diff_tol=0.005, rtol=0.02, atol=0.2)
    run_imagenet_classifier_test(
        quantized_model,
        MODEL_ID,
        probability_threshold=0.45,
        asset_version=MODEL_ASSET_VERSION,
        **tolerances,
    )


def test_trace():
    """Run the shared trace check on the quantized ResNet18."""
    quantized_model = ResNet18Quantizable.from_pretrained()
    # The trace check uses a slightly looser diff_tol (0.007) than test_task.
    tolerances = dict(diff_tol=0.007, rtol=0.02, atol=0.2)
    run_imagenet_classifier_trace_test(
        quantized_model,
        is_quantized=True,
        **tolerances,
    )


def test_demo():
    """Smoke test: the demo entry point must finish without raising."""
    demo_main(is_test=True)
Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ResNet50 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/resnet50/__init__.py b/qai_hub_models/models/resnet50/__init__.py new file mode 100644 index 00000000..9b5ef003 --- /dev/null +++ b/qai_hub_models/models/resnet50/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNet50 as Model # noqa: F401 diff --git a/qai_hub_models/models/resnet50/demo.py b/qai_hub_models/models/resnet50/demo.py new file mode 100644 index 00000000..f6afa0d8 --- /dev/null +++ b/qai_hub_models/models/resnet50/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnet50.model import ResNet50 + + +def main(is_test: bool = False): + imagenet_demo(ResNet50, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnet50/export.py b/qai_hub_models/models/resnet50/export.py new file mode 100644 index 00000000..5ff31d8a --- /dev/null +++ b/qai_hub_models/models/resnet50/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.resnet50 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnet50" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnet50", + "ResNet50", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnet50/info.yaml b/qai_hub_models/models/resnet50/info.yaml new file mode 100644 index 00000000..b3ea298a --- /dev/null +++ b/qai_hub_models/models/resnet50/info.yaml @@ -0,0 +1,39 @@ +name: ResNet50 +# id must match with the model dir name in qai_hub_models +id: resnet50 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +use_case: Image Classification +description: ResNet50 is a machine learning model that can classify images from the + Imagenet dataset. It can also be used as a backbone in building more complex models + for specific use cases. 
+tags: +- backbone +research_paper: https://arxiv.org/abs/1512.03385 +research_paper_title: Deep Residual Learning for Image Recognition +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 25.6M + Model size: 97.5 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: +- Medical Imaging +- Anomaly Detection +- Inventory Management +related_models: +- mobilenet_v2 +- densenet121 +- googlenet +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnet50/model.py b/qai_hub_models/models/resnet50/model.py new file mode 100644 index 00000000..ebccb5a3 --- /dev/null +++ b/qai_hub_models/models/resnet50/model.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class ResNet50(ImagenetClassifier): + model_builder = tv_models.resnet50 + DEFAULT_WEIGHTS = DEFAULT_WEIGHTS diff --git a/qai_hub_models/models/resnet50/perf.yaml b/qai_hub_models/models/resnet50/perf.yaml new file mode 100644 index 00000000..f070af48 --- /dev/null +++ b/qai_hub_models/models/resnet50/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 
Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ResNet50 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1904.0 + throughput: 525.2100840336135 + estimated_peak_memory_range: + min: 20480 + max: 2314168 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 77 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 77 + job_id: j1p8em6zp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1768.0 + throughput: 565.6108597285067 + estimated_peak_memory_range: + min: 634880 + max: 186280024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 125 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 125 + job_id: jogk2qoyg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:07:34.762219Z' diff --git a/qai_hub_models/models/resnet50/test.py b/qai_hub_models/models/resnet50/test.py new file mode 100644 index 00000000..3da76648 --- /dev/null +++ b/qai_hub_models/models/resnet50/test.py @@ -0,0 +1,26 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.resnet50.demo import main as demo_main +from qai_hub_models.models.resnet50.model import MODEL_ID, ResNet50 + + +def test_task(): + run_imagenet_classifier_test( + ResNet50.from_pretrained(), + MODEL_ID, + probability_threshold=0.45, + diff_tol=0.005, + atol=0.02, + rtol=0.2, + ) + + +def test_trace(): + run_imagenet_classifier_trace_test(ResNet50.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/resnext101/README.md b/qai_hub_models/models/resnext101/README.md new file mode 100644 index 00000000..f8dd7a49 --- /dev/null 
+++ b/qai_hub_models/models/resnext101/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNeXt101: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnext101) + +ResNeXt101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNeXt101 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/resnext101). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnext101.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnext101.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root.
+- The license for the original implementation of ResNeXt101 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/resnext101/__init__.py b/qai_hub_models/models/resnext101/__init__.py new file mode 100644 index 00000000..7ebe4dfa --- /dev/null +++ b/qai_hub_models/models/resnext101/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNeXt101 as Model # noqa: F401 diff --git a/qai_hub_models/models/resnext101/demo.py b/qai_hub_models/models/resnext101/demo.py new file mode 100644 index 00000000..f983e663 --- /dev/null +++ b/qai_hub_models/models/resnext101/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnext101.model import ResNeXt101 + + +def main(is_test: bool = False): + imagenet_demo(ResNeXt101, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext101/export.py b/qai_hub_models/models/resnext101/export.py new file mode 100644 index 00000000..8a6c2bf0 --- /dev/null +++ b/qai_hub_models/models/resnext101/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.resnext101 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnext101" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnext101", + "ResNeXt101", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext101/info.yaml b/qai_hub_models/models/resnext101/info.yaml new file mode 100644 index 00000000..d27015ff --- /dev/null +++ b/qai_hub_models/models/resnext101/info.yaml @@ -0,0 +1,39 @@ +name: ResNeXt101 +# id must match with the model dir name in qai_hub_models +id: resnext101 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +use_case: Image Classification +description: ResNeXt101 is a machine learning model that can classify images from + the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. 
+tags: +- backbone +research_paper: https://arxiv.org/abs/1611.05431 +research_paper_title: Aggregated Residual Transformations for Deep Neural Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 88.8M + Model size: 338 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: +- Medical Imaging +- Anomaly Detection +- Inventory Management +related_models: +- mobilenet_v2 +- densenet121 +- googlenet +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnext101/model.py b/qai_hub_models/models/resnext101/model.py new file mode 100644 index 00000000..39d8db1e --- /dev/null +++ b/qai_hub_models/models/resnext101/model.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class ResNeXt101(ImagenetClassifier): + model_builder = tv_models.resnext101_32x8d + DEFAULT_WEIGHTS = DEFAULT_WEIGHTS diff --git a/qai_hub_models/models/resnext101/perf.yaml b/qai_hub_models/models/resnext101/perf.yaml new file mode 100644 index 00000000..05f03000 --- /dev/null +++ b/qai_hub_models/models/resnext101/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy 
Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ResNeXt101 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 6434.0 + throughput: 155.4243083618278 + estimated_peak_memory_range: + min: 28672 + max: 2709368 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 145 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 145 + job_id: j1pvlr475 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6146.0 + throughput: 162.70745200130165 + estimated_peak_memory_range: + min: 16384 + max: 38657672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 244 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 244 + job_id: j7gjr217p + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:21:26.759411Z' diff --git a/qai_hub_models/models/resnext101/test.py b/qai_hub_models/models/resnext101/test.py new file mode 100644 index 00000000..9a58a983 --- /dev/null +++ b/qai_hub_models/models/resnext101/test.py @@ -0,0 +1,21 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.resnext101.demo import main as demo_main +from qai_hub_models.models.resnext101.model import MODEL_ID, ResNeXt101 + + +def test_task(): + run_imagenet_classifier_test( + ResNeXt101.from_pretrained(), MODEL_ID, atol=0.02, rtol=0.02, diff_tol=0.005 + ) + + +def test_trace(): + run_imagenet_classifier_trace_test(ResNeXt101.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/resnext101_quantized/README.md b/qai_hub_models/models/resnext101_quantized/README.md new file mode 100644 
index 00000000..51d5dd6f --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNeXt101Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnext101_quantized) + +ResNeXt101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNeXt101Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/resnext101_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnext101_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnext101_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ResNeXt101Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/resnext101_quantized/__init__.py b/qai_hub_models/models/resnext101_quantized/__init__.py new file mode 100644 index 00000000..a5ed0a1c --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNeXt101Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/resnext101_quantized/demo.py b/qai_hub_models/models/resnext101_quantized/demo.py new file mode 100644 index 00000000..afbd5796 --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnext101_quantized.model import ResNeXt101Quantizable + + +def main(is_test: bool = False): + imagenet_demo(ResNeXt101Quantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext101_quantized/export.py b/qai_hub_models/models/resnext101_quantized/export.py new file mode 100644 index 00000000..a157bbe0 --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/export.py @@ -0,0 +1,195 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.resnext101_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnext101_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnext101_quantized", + "ResNeXt101Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext101_quantized/info.yaml b/qai_hub_models/models/resnext101_quantized/info.yaml new file mode 100644 index 00000000..eae898e9 --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/info.yaml @@ -0,0 +1,39 @@ +name: ResNeXt101Quantized +# id must match with the model dir name in qai_hub_models +id: resnext101_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +use_case: Image Classification +description: ResNeXt101 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+tags: + - backbone + - quantized +research_paper: https://arxiv.org/abs/1611.05431 +research_paper_title: Aggregated Residual Transformations for Deep Neural Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 88.8M + Model size: 340 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnext101_quantized/model.py b/qai_hub_models/models/resnext101_quantized/model.py new file mode 100644 index 00000000..8fa9bbfe --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/model.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.resnext101.model import ResNeXt101 +from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 3 +DEFAULT_ENCODINGS = "resnext101_quantized_encodings.json" + + +class ResNeXt101Quantizable(AIMETQuantizableMixin, ResNeXt101): + """ResNeXt101 with post train quantization support. + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. 
+ Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + ResNeXt101.__init__(self, sim_model.model) + AIMETQuantizableMixin.__init__( + self, sim_model, needs_onnx_direct_aimet_export=False + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "ResNeXt101Quantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + model = ResNeXt101.from_pretrained() + input_shape = model.get_input_spec()["image_tensor"][0] + + equalize_model(model, input_shape) + sim = QuantizationSimModel( + model.net, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_per_channel_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/resnext101_quantized/perf.yaml b/qai_hub_models/models/resnext101_quantized/perf.yaml new file mode 100644 index 00000000..0d14a5d4 --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab 
S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ResNeXt101Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 76378.0 + throughput: 13.092775406530677 + estimated_peak_memory_range: + min: 143360 + max: 3223784 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 149 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 149 + job_id: jmg9zy8qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:24:55.190881Z' diff --git a/qai_hub_models/models/resnext101_quantized/test.py b/qai_hub_models/models/resnext101_quantized/test.py new file mode 100644 index 00000000..3650133e --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/test.py @@ -0,0 +1,37 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.resnext101_quantized.demo import main as demo_main +from qai_hub_models.models.resnext101_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ResNeXt101Quantizable, +) + + +def test_task(): + run_imagenet_classifier_test( + ResNeXt101Quantizable.from_pretrained(), + MODEL_ID, + probability_threshold=0.46, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + asset_version=MODEL_ASSET_VERSION, + ) + + +def test_trace(): + run_imagenet_classifier_trace_test( + ResNeXt101Quantizable.from_pretrained(), + is_quantized=True, + diff_tol=0.007, + rtol=0.02, 
+ atol=0.2, + ) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/resnext50/README.md b/qai_hub_models/models/resnext50/README.md new file mode 100644 index 00000000..60c0361d --- /dev/null +++ b/qai_hub_models/models/resnext50/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNeXt50: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnext50) + +ResNeXt50 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNeXt50 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/resnext50). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnext50.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnext50.export +``` +Additional options are documented with the `--help` option.
Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of ResNeXt50 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/resnext50/__init__.py b/qai_hub_models/models/resnext50/__init__.py new file mode 100644 index 00000000..b4a237f1 --- /dev/null +++ b/qai_hub_models/models/resnext50/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNeXt50 as Model # noqa: F401 diff --git a/qai_hub_models/models/resnext50/demo.py b/qai_hub_models/models/resnext50/demo.py new file mode 100644 index 00000000..15236f1a --- /dev/null +++ b/qai_hub_models/models/resnext50/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnext50.model import ResNeXt50 + + +def main(is_test: bool = False): + imagenet_demo(ResNeXt50, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext50/export.py b/qai_hub_models/models/resnext50/export.py new file mode 100644 index 00000000..96c7f2be --- /dev/null +++ b/qai_hub_models/models/resnext50/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.resnext50 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnext50" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnext50", + "ResNeXt50", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext50/info.yaml b/qai_hub_models/models/resnext50/info.yaml new file mode 100644 index 00000000..443af9f0 --- /dev/null +++ b/qai_hub_models/models/resnext50/info.yaml @@ -0,0 +1,39 @@ +name: ResNeXt50 +# id must match with the model dir name in qai_hub_models +id: resnext50 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ResNeXt50 is a machine learning model that can classify images from the + Imagenet dataset. It can also be used as a backbone in building more complex models + for specific use cases. 
+use_case: Image Classification +tags: +- backbone +research_paper: https://arxiv.org/abs/1611.05431 +research_paper_title: Aggregated Residual Transformations for Deep Neural Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 25.0M + Model size: 95.4 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: +- Medical Imaging +- Anomaly Detection +- Inventory Management +related_models: +- mobilenet_v2 +- densenet121 +- googlenet +form_factors: +- Phone +- Tablet +- IoT +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnext50/model.py b/qai_hub_models/models/resnext50/model.py new file mode 100644 index 00000000..40e05f85 --- /dev/null +++ b/qai_hub_models/models/resnext50/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V2" + + +class ResNeXt50(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.resnext50_32x4d(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/resnext50/perf.yaml b/qai_hub_models/models/resnext50/perf.yaml new file mode 100644 index 00000000..7801c578 --- /dev/null +++ b/qai_hub_models/models/resnext50/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - 
Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: ResNeXt50 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 2118.0 + throughput: 472.14353163361665 + estimated_peak_memory_range: + min: 16384 + max: 2188056 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 77 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 77 + job_id: jep2r94xg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2068.0 + throughput: 483.55899419729207 + estimated_peak_memory_range: + min: 16384 + max: 67185584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 125 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 125 + job_id: jqpyojqr5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:07:32.076107Z' diff --git a/qai_hub_models/models/resnext50/test.py b/qai_hub_models/models/resnext50/test.py new file mode 100644 index 00000000..86169f61 --- /dev/null +++ b/qai_hub_models/models/resnext50/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.resnext50.demo import main as demo_main +from qai_hub_models.models.resnext50.model import MODEL_ID, ResNeXt50 + + +def test_task(): + run_imagenet_classifier_test(ResNeXt50.from_pretrained(), MODEL_ID) + + +def test_trace(): + run_imagenet_classifier_trace_test(ResNeXt50.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/sam/README.md 
b/qai_hub_models/models/sam/README.md new file mode 100644 index 00000000..3c30d212 --- /dev/null +++ b/qai_hub_models/models/sam/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Segment-Anything-Model: High-quality segmentation mask generation around any object in an image with simple input prompt](https://aihub.qualcomm.com/models/sam) + +Transformer based encoder-decoder where prompts specify what to segment in an image thereby allowing segmentation without the need for additional training. The image encoder generates embeddings and the lightweight decoder operates on the embeddings for point and mask based image segmentation. + +This is based on the implementation of Segment-Anything-Model found +[here](https://github.com/facebookresearch/segment-anything). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/sam). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[sam]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.sam.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment.
This can be run as follows: + +```bash +python -m qai_hub_models.models.sam.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Segment-Anything-Model can be found + [here](https://github.com/facebookresearch/segment-anything/blob/main/LICENSE). + + +## References +* [Segment Anything](https://arxiv.org/abs/2304.02643) +* [Source Model Implementation](https://github.com/facebookresearch/segment-anything) diff --git a/qai_hub_models/models/sam/__init__.py b/qai_hub_models/models/sam/__init__.py new file mode 100644 index 00000000..96afc8ed --- /dev/null +++ b/qai_hub_models/models/sam/__init__.py @@ -0,0 +1,3 @@ +from .app import SAMApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import SAMQAIHMWrapper as Model # noqa: F401 diff --git a/qai_hub_models/models/sam/app.py b/qai_hub_models/models/sam/app.py new file mode 100644 index 00000000..32249e5d --- /dev/null +++ b/qai_hub_models/models/sam/app.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +from typing import Tuple, no_type_check + +import numpy as np +import torch + +from qai_hub_models.models.sam.model import SAMQAIHMWrapper + + +class SAMApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with Segment-Anything Model. 
+ + The app uses 2 models: + * encoder (Given input image, emits image embeddings to be used by decoder) + * decoder (Lightweight decoder, modified to accept and work with fixed image size) + + For a given image input, the app will: + * Prepare: Runs encoder on given image and creates and caches embeddings + * Generate masks: Uses cached embeddings and generate masks for given points + """ + + @no_type_check + def __init__(self, model: SAMQAIHMWrapper): + self.orig_img_size = None + self.image_embeddings = None + self.sam_qaihm_wrapper = model + self.sam_encoder = self.sam_qaihm_wrapper.get_sam_encoder() + self.sam_decoder = None + + def prepare(self, input_image: np.ndarray, single_mask_mode=True): + """ + Prepares App for segmentation of given input image + - Pre-processes input image + - Initiate Decoder with input image size + + Parameters: + input_image: np.ndarray + Input RGB image loaded as numpy array. + single_mask_mode: bool + Set decoder to return single mask for given points. + """ + if self.sam_encoder is None: + self.sam_encoder = self.sam_qaihm_wrapper.get_sam_encoder() + + preprocessed_image = self.sam_encoder.preprocess_input_image(input_image) + self.image_embeddings = self.sam_encoder(preprocessed_image) + + # Initialize decoder + self.orig_img_size = input_image.shape[:2] + self.sam_decoder = self.sam_qaihm_wrapper.get_sam_decoder( + self.orig_img_size, single_mask_mode + ) + + def reset(self): + """Reset app state""" + self.image_embeddings = None + self.orig_img_size = None + self.sam_decoder = None + + def preprocess_point_coordinates( + self, input_coords: np.ndarray, image_shape: Tuple[int, int] + ): + """Preprocesses Point coordinates to work with decoder""" + if self.sam_encoder is None: + raise RuntimeError("Encoder is not intialized. Please run `app.prepare`.") + return torch.Tensor( + self.sam_encoder.transforms.apply_coords(input_coords, image_shape) + ) + + def predict(self, *args, **kwargs): + # See generate_mask_from_points.
+ return self.generate_mask_from_points(*args, **kwargs) + + def generate_mask_from_points( + self, + point_coords: torch.Tensor, + point_labels: torch.Tensor, + ) -> torch.Tensor: + """ + Generate masks from given points + + Parameters: + point_coords: torch.Tensor of shape [k, 2] + Point coordinates from input image for segmentation + point_labels: torch.Tensor of shape [k] + Point Labels to select/de-select given point for segmentation + e.g. Corresponding value is 1 if this point is to be included, otherwise 0 + Returns: + upscaled_masks: torch.Tensor of shape [1, k, ] + score: torch.Tensor of shape [1, k] + masks: torch.Tensor of shape [1, k, 256, 256] + Use this low resolution masks to further slice and upscale for resolutions that Decoder is not intialized to. + + Where, + k = number of points + """ + if self.sam_decoder is None: + raise RuntimeError( + "Please call `prepare_from_image` or `prepare` before calling `segment`." + ) + + # Prepare inputs for decoder + # Preprocess point co-ordinates for decoder + point_coords = self.preprocess_point_coordinates( + np.expand_dims(np.array(point_coords), 0), self.orig_img_size + ) + point_labels = torch.Tensor(point_labels).unsqueeze(0) + mask_input = torch.zeros(self.sam_decoder.get_input_spec()["mask_input"][0]) + has_mask_input = torch.zeros((1,)) + + upscaled_masks, scores, masks = self.sam_decoder( + self.image_embeddings, + point_coords, + point_labels, + mask_input, + has_mask_input, + ) + + # Reduce noise from generated masks + upscaled_masks = self.postprocess_mask(upscaled_masks) + masks = self.postprocess_mask(masks) + + return upscaled_masks, scores, masks + + def postprocess_mask(self, generated_mask: torch.Tensor): + """Drop masks lower than threshold to minimize noise""" + return generated_mask > self.sam_qaihm_wrapper.get_sam().mask_threshold diff --git a/qai_hub_models/models/sam/demo.py b/qai_hub_models/models/sam/demo.py new file mode 100644 index 00000000..91fb063a --- /dev/null +++ 
b/qai_hub_models/models/sam/demo.py @@ -0,0 +1,92 @@ +import argparse + +import numpy as np + +from qai_hub_models.models.sam.app import SAMApp +from qai_hub_models.models.sam.model import ( + DEFAULT_MODEL_TYPE, + MODEL_ASSET_VERSION, + MODEL_ID, + SMALL_MODEL_TYPE, + SAMQAIHMWrapper, +) +from qai_hub_models.models.sam.utils import show_image +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "truck.jpg" +) + + +# Run SAM end-to-end model on given image. +# The demo will output image with segmentation mask applied for input points +def main(is_test: bool = False): + # Demo parameters + parser = argparse.ArgumentParser() + parser.add_argument( + "--image", + type=str, + default=IMAGE_ADDRESS, + help="image file path or URL", + ) + parser.add_argument( + "--model_type", + type=str, + default=DEFAULT_MODEL_TYPE, + help=f"SAM model type to load. Tested with model type `{DEFAULT_MODEL_TYPE}`.", + ) + parser.add_argument( + "--point_coordinates", + type=str, + default="500,375;", + help="Comma separated x and y coordinate. Multiple coordinate separated by `;`." + " e.g. `x1,y1;x2,y2`. Default: `500,375;`", + ) + parser.add_argument( + "--single_mask_mode", + type=bool, + default=True, + help="If True, returns single mask. For multiple points multiple masks could lead to better results.", + ) + args = parser.parse_args(["--model_type", SMALL_MODEL_TYPE] if is_test else None) + + coordinates = list(filter(None, args.point_coordinates.split(";"))) + + # Load Application + app = SAMApp(SAMQAIHMWrapper.from_pretrained(model_type=args.model_type)) + + # Load Image + image = load_image(args.image) + image_data = np.asarray(image) + + # Prepare SAM for decoder for given input image: + # i.e. 
run SAM encoder to generate and cache image embeddings + app.prepare(image_data, single_mask_mode=args.single_mask_mode) + + # Point segmentation using decoder + print("\n** Performing point segmentation **\n") + + # Input points + input_coords = [] + input_labels = [] + + for coord in coordinates: + coord_split = coord.split(",") + if len(coord_split) != 2: + raise RuntimeError( + f"Expecting comma separated x and y coordinate. Provided {coord_split}." + ) + + input_coords.append([int(coord_split[0]), int(coord_split[1])]) + # Set label to `1` to include current point for segmentation + input_labels.append(1) + + # Generate masks with given input points + generated_mask, *_ = app.generate_mask_from_points(input_coords, input_labels) + + if not is_test: + show_image(image, generated_mask) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/sam/export.py b/qai_hub_models/models/sam/export.py new file mode 100644 index 00000000..5119bd3f --- /dev/null +++ b/qai_hub_models/models/sam/export.py @@ -0,0 +1,225 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Mapping, Optional, Tuple + +import qai_hub as hub +import torch +from torch.utils.mobile_optimizer import MobileOptimizerType, optimize_for_mobile + +from qai_hub_models.models.sam import Model +from qai_hub_models.utils.args import ( + export_parser, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["SAMDecoder", "SAMEncoder"] +DEFAULT_COMPONENTS = ["SAMDecoder"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[ + str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. 
+ Defaults to DEFAULT_DEVICE if not specified. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "sam" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component in components: + if component not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "sam", + "Segment-Anything-Model", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict = {} + if "SAMDecoder" in components: + components_dict["SAMDecoder"] = model.get_sam_decoder() + if "SAMEncoder" in components: + components_dict["SAMEncoder"] = model.get_sam_encoder() + + compile_jobs = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + + source_model = optimize_for_mobile( + source_model, + optimization_blocklist={ + MobileOptimizerType.HOIST_CONV_PACKED_PARAMS, + MobileOptimizerType.INSERT_FOLD_PREPACK_OPS, + MobileOptimizerType.CONV_BN_FUSION, + }, + ) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image --force_channel_last_output output_0", + ) + print(f"Optimizing model {component_name} to run on-device.") + compile_jobs[component_name] = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{component_name}", + options=model_compile_options, + ) + + # 3. Profile the model assets on real devices + profile_jobs = {} + if not skip_profiling: + for component_name in components: + print(f"Profiling model {component_name} on a hosted device.") + profile_jobs[component_name] = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_jobs = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." 
+ ) + sample_inputs = components_dict[component_name].sample_inputs() + inference_jobs[component_name] = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 5. Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model = compile_job.get_target_model() + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/sam/info.yaml b/qai_hub_models/models/sam/info.yaml new file mode 100644 index 00000000..29532e53 --- /dev/null +++ 
b/qai_hub_models/models/sam/info.yaml @@ -0,0 +1,35 @@ +name: Segment-Anything-Model +# id must match with the model dir name in qai_hub_models +id: sam +status: public +headline: High-quality segmentation mask generation around any object in an + image with simple input prompt. +domain: Computer Vision +use_case: Semantic Segmentation +description: Transformer based encoder-decoder where prompts specify what to + segment in an image thereby allowing segmentation without the need for additional + training. The image encoder generates embeddings and the lightweight decoder + operates on the embeddings for point and mask based image segmentation. +tags: + - foundation +research_paper: https://arxiv.org/abs/2304.02643 +research_paper_title: Segment Anything +license: https://github.com/facebookresearch/segment-anything/blob/main/LICENSE +source_repo: https://github.com/facebookresearch/segment-anything +technical_details: + Number of parameters: 4M + Image decoder model size: 20 MB + Model checkpoint: vit_l + Input resolution: 720p (720x1280) +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +form_factors: + - Phone + - Tablet +related_models: [] +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +dataset: [] diff --git a/qai_hub_models/models/sam/model.py b/qai_hub_models/models/sam/model.py new file mode 100644 index 00000000..7e59adbe --- /dev/null +++ b/qai_hub_models/models/sam/model.py @@ -0,0 +1,287 @@ +from __future__ import annotations + +import os +import sys +import tempfile +from typing import Callable, Tuple + +import numpy as np +import torch + +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_path, + maybe_clone_git_repo, +) +from qai_hub_models.utils.base_model import BaseModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +# This is a fork of https://github.com/facebookresearch/segment-anything +# with changes to make the SAM decoder 
traceable +SAM_SOURCE_REPO = "https://github.com/dmckinnon/segment-anything" +SAM_SOURCE_REPO_COMMIT = "0bc06e062ca883c2524bfa79061807c535eb0d51" +MODEL_ID = __name__.split(".")[-2] +DEFAULT_MODEL_TYPE = "vit_l" +SMALL_MODEL_TYPE = "vit_b" +MODEL_REGISTERY = { + "vit_b": "sam_vit_b_01ec64.pth", # 91M params + "vit_l": "sam_vit_l_0b3195.pth", # 308M params + "vit_h": "sam_vit_h_4b8939.pth", # 636M params +} +MODEL_ASSET_VERSION = 1 + + +class SAMQAIHMWrapper(CollectionModel): + """ + QAIHM version of segment-anything (https://github.com/dmckinnon/segment-anything) + + QAIHM fork modifies following from parent segment-anything repo: + 1. window_partition in encoder works on rank-5 tensor instead of rank-6 tensor + 2. SamOnnxModel accepts `orig_img_size` to use static upsample instead of dynamic upsample + """ + + def __init__( + self, + sam: torch.nn.Module, + sam_encoder: Callable, + SamOnnxModel, + ResizeLongestSide, + SamPredictor, + ): + self.sam = sam + self.sam_encoder = sam_encoder + self.SamOnnxModel = SamOnnxModel + self.ResizeLongestSide = ResizeLongestSide + self.SamPredictor = SamPredictor + + def get_sam(self) -> torch.nn.Module: + return self.sam + + def get_sam_encoder(self) -> Callable: + return self.sam_encoder + + # Create a new decoder + def get_sam_decoder( + self, orig_img_size: Tuple[int, int] = (720, 1280), single_mask_mode=True + ) -> Callable: + self.sam_decoder = SegmentAnythingONNXDecoder( + self, + single_mask_mode=single_mask_mode, + orig_img_size=orig_img_size, + ) + return self.sam_decoder + + @classmethod + def from_pretrained(cls, model_type: str = DEFAULT_MODEL_TYPE) -> SAMQAIHMWrapper: + ( + sam_model_registry, + SamOnnxModel, + ResizeLongestSide, + SamPredictor, + ) = _patch_sam_with_qaihm_modules() + sam = load_sam_model(sam_model_registry, model_type) + sam_encoder = SegmentAnythingEncoder(sam, ResizeLongestSide) + return cls(sam, sam_encoder, SamOnnxModel, ResizeLongestSide, SamPredictor) + + def __call__(self, image: 
torch.Tensor) -> torch.Tensor: + raise NotImplementedError("Cannot call SAMQAIHMWrapper directly") + + +class SegmentAnythingEncoder(BaseModel): + """Exportable SAM encoder""" + + def __init__( + self, + sam: torch.nn.Module, + ResizeLongestSide: Callable, + ) -> None: + super().__init__() + self.sam = sam + self.transforms = ResizeLongestSide(self.sam.image_encoder.img_size) + + def forward(self, image: torch.Tensor) -> torch.Tensor: + """ + Run SAM Image encoder and returns image embeddings + + Parameters: + image: Pixel values pre-procewindow_partitionssed for encoder consumption. + Range: float[0, 255] normalized via preprocess_input_image + 3-channel Color Space: RGB + + Returns: + image_embeddings + """ + return self.sam.image_encoder(image) + + def get_input_spec( + self, + height: int = 720, + width: int = 1280, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. 
+ + preprocessed_image = self.preprocess_input_image( + np.ones((height, width, 3), dtype=np.uint8) + ) + return {"image": (preprocessed_image.shape, "float32")} + + def preprocess_input_image(self, input_image: np.ndarray): + """Transform input image to work with SAM encoder""" + transformed_image = torch.as_tensor( + self.transforms.apply_image(input_image) + ).type(torch.float32) + transformed_image = transformed_image.permute(2, 0, 1).contiguous()[ + None, :, :, : + ] + + self.input_size = transformed_image.shape[-2:] + self.original_size = input_image.shape[:2] + return self.sam.preprocess(transformed_image) + + @classmethod + def from_pretrained(cls): + return SAMQAIHMWrapper.from_pretrained().get_sam_encoder() + + +class SegmentAnythingONNXDecoder(BaseModel): + """Exportable SAM decoder""" + + def __init__( + self, + sam_qaihm_wrapper: SAMQAIHMWrapper, + orig_img_size: Tuple[int, int] = (720, 1280), + single_mask_mode: bool = True, + ) -> None: + super().__init__() + self.sam = sam_qaihm_wrapper.get_sam() + self.sam_decoder = sam_qaihm_wrapper.SamOnnxModel( + self.sam, orig_img_size=orig_img_size, return_single_mask=single_mask_mode + ) + self.transforms = sam_qaihm_wrapper.ResizeLongestSide( + self.sam.image_encoder.img_size + ) + + def forward( + self, + image_embeddings: torch.Tensor, + point_coords: torch.Tensor, + point_labels: torch.Tensor, + mask_input: torch.Tensor, + has_mask_input: torch.Tensor, + ) -> torch.Tensor: + """ + Run SAM lightweight decoder and return generated mask for given points + + Parameters: + image_embeddings: torch.Tensor of shape [1, emb_dim, emb_size, emb_size] + Image embeddings generated by Encoder + point_coords: torch.Tensor of shape [1, k, 2] + Point coordinates from input image for segmentation + point_labels: torch.Tensor of shape [1, k] + Point Labels to select/de-select given point for segmentation + e.g. 
Corresponding value is 1 if this point is to be included, otherwise 0 + mask_input: torch.Tensor of shape [1, 1, 4 * image_emd_size, 4 * image_emb_size] + Input mask to consider for segmentation. If using point based segmentation, set this to torch.zeros() + has_mask_input: torch.Tensor of shape [1] + If has value [1] then mask_input is used, otherwise no. + If using point based segmentation, can set this to [0] + + Returns: + upscaled_masks: torch.Tensor of shape [1, k, ] + score: torch.Tensor of shape [1, k] + masks: torch.Tensor of shape [1, k, 256, 256] + Use this low resolution masks to further slice and upscale for resolutions that Decoder is not intialized to. + + Where, + k = number of points + """ + return self.sam_decoder( + image_embeddings, point_coords, point_labels, mask_input, has_mask_input + ) + + def get_input_spec( + self, + num_of_points=1, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. 
+ embed_dim = self.sam.prompt_encoder.embed_dim + embed_size = self.sam.prompt_encoder.image_embedding_size + mask_input_size = [4 * x for x in embed_size] + + input_spec = { + "image_embeddings": ((1, embed_dim, *embed_size), "float32"), + "point_coords": ((1, num_of_points, 2), "float32"), + "point_labels": ((1, num_of_points), "float32"), + "mask_input": ((1, 1, *mask_input_size), "float32"), + "has_mask_input": ((1,), "float32"), + } + return input_spec + + @classmethod + def from_pretrained(cls): + return SAMQAIHMWrapper.from_pretrained().get_sam_decoder() + + +def _get_weights_url(model_type: str = DEFAULT_MODEL_TYPE): + """Convert from names of weights files to the url for the weights file""" + if model_type not in MODEL_REGISTERY.keys(): + raise RuntimeError(f"Weights not found for model type `{model_type}`.") + + return CachedWebModelAsset( + f"https://dl.fbaipublicfiles.com/segment_anything/{MODEL_REGISTERY[model_type]}", + MODEL_ID, + MODEL_ASSET_VERSION, + f"{MODEL_REGISTERY[model_type]}", + ) + + +def load_sam_model( + sam_model_registry, model_type: str = DEFAULT_MODEL_TYPE +) -> torch.nn.Module: + """Loads SAM model of given model type""" + weights_url = _get_weights_url(model_type) + with tempfile.TemporaryDirectory() as tmpdir: + weights_path = load_path(weights_url, tmpdir) + sam = sam_model_registry[model_type](weights_path) + sam.eval() + return sam + + +def _patch_sam_with_qaihm_modules(): + """ + Patches segment-anything with modifications + + Returns: + sam_model_registry: semgment_anything.sam_model_registry + dictionary of str (model_type) to callback to build respective model + SamOnnxModel: torch.nn.Module + light-weight decoder with fix image size + ResizeLongestSide: segment_anything.utils.transforms.ResizeLongestSide + Resizing utility updated to work with input image size + SamPredictor: segment_anything.SamPredictor + Python class wrapper to call image encoder - decoder + """ + sam_repo_path = maybe_clone_git_repo( + SAM_SOURCE_REPO, 
SAM_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + ) + cwd = os.getcwd() + try: + # Patch path for this load only + sys.path.insert(0, sam_repo_path) + + # import required modules and utilities + from segment_anything import SamPredictor, sam_model_registry + from segment_anything.utils.onnx import SamOnnxModel + from segment_anything.utils.transforms import ResizeLongestSide + + return sam_model_registry, SamOnnxModel, ResizeLongestSide, SamPredictor + finally: + # Reset global state + os.chdir(cwd) + sys.path.remove(sam_repo_path) diff --git a/qai_hub_models/models/sam/perf.yaml b/qai_hub_models/models/sam/perf.yaml new file mode 100644 index 00000000..a098fa50 --- /dev/null +++ b/qai_hub_models/models/sam/perf.yaml @@ -0,0 +1,92 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + supported_chipsets: [] +models: +- name: SAMDecoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:04:02.192386Z' +- name: SAMEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + 
primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:04:02.360369Z' diff --git a/qai_hub_models/models/sam/requirements.txt b/qai_hub_models/models/sam/requirements.txt new file mode 100644 index 00000000..116e68bc --- /dev/null +++ b/qai_hub_models/models/sam/requirements.txt @@ -0,0 +1,4 @@ +matplotlib +opencv_python +pycocotools +requests diff --git a/qai_hub_models/models/sam/test.py b/qai_hub_models/models/sam/test.py new file mode 100644 index 00000000..378ee779 --- /dev/null +++ b/qai_hub_models/models/sam/test.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest +import torch + +from qai_hub_models.models.sam import App +from qai_hub_models.models.sam.demo import IMAGE_ADDRESS +from qai_hub_models.models.sam.demo import main as demo_main +from qai_hub_models.models.sam.model import SMALL_MODEL_TYPE, SAMQAIHMWrapper +from qai_hub_models.utils.asset_loaders import load_image +from qai_hub_models.utils.testing import skip_clone_repo_check_fixture # noqa: F401 + + +@pytest.fixture(scope="module") +def input_image_data() -> np.ndarray: + return np.asarray(load_image(IMAGE_ADDRESS)) + + +def test_e2e_numerical( + input_image_data: np.ndarray, + monkeypatch, + skip_clone_repo_check_fixture, +): + """Verify our driver produces the correct segmentation as source PyTorch model""" + monkeypatch.setattr("builtins.input", lambda: "y") + + sam_wrapper = 
SAMQAIHMWrapper.from_pretrained(SMALL_MODEL_TYPE) + sam_model = sam_wrapper.get_sam() + sam_predictor = sam_wrapper.SamPredictor(sam_model) + sam_decoder = sam_wrapper.SamOnnxModel( + sam_model, orig_img_size=input_image_data.shape[:2], return_single_mask=True + ) + + sam_predictor.set_image(input_image_data) + # QAIHM SAMApp for segmentation + sam_app = App(sam_wrapper) + # Prepare image for segmentation + sam_app.prepare(input_image_data) + + # Ensure image embeddings match with source model + np.allclose( + sam_predictor.features.detach().numpy(), + sam_app.image_embeddings.detach().numpy(), + ) + + # + # Verify Decoder output is correct + # + + # Create input for decoder + embed_size = sam_predictor.model.prompt_encoder.image_embedding_size + mask_input_size = [4 * x for x in embed_size] + decoder_inputs = { + "image_embeddings": sam_predictor.features.detach(), + "point_coords": torch.randint(low=0, high=500, size=(1, 2), dtype=torch.float), + "point_labels": torch.randint(low=0, high=4, size=(1,), dtype=torch.float), + "mask_input": torch.zeros(1, 1, *mask_input_size, dtype=torch.float), + "has_mask_input": torch.tensor([1], dtype=torch.float), + } + + # Perform inference for decoder models + obs_decoder_output = sam_app.generate_mask_from_points( + decoder_inputs["point_coords"], + decoder_inputs["point_labels"], + ) + + decoder_inputs["point_coords"] = decoder_inputs["point_coords"].unsqueeze(0) + decoder_inputs["point_labels"] = decoder_inputs["point_labels"].unsqueeze(0) + exp_decoder_output = sam_decoder(*decoder_inputs.values()) + + # Ensure segmentation upscaled mask, scores and low-res masks match with source model + for exp, obs in zip(exp_decoder_output, obs_decoder_output): + np.allclose(exp.detach().numpy(), obs.detach().numpy()) + + +def test_demo(skip_clone_repo_check_fixture): + demo_main(is_test=True) diff --git a/qai_hub_models/models/sam/utils.py b/qai_hub_models/models/sam/utils.py new file mode 100644 index 00000000..c40890d7 --- /dev/null 
+++ b/qai_hub_models/models/sam/utils.py @@ -0,0 +1,21 @@ +import matplotlib.pyplot as plt +import numpy as np + + +## Helper routines +def show_image(image, masks=None): + """Show input image with mask applied""" + plt.figure(figsize=(10, 10)) + plt.imshow(image) + if masks is not None: + _show_mask(masks, plt.gca()) + plt.axis("off") + plt.show() + + +def _show_mask(mask, ax): + """Helper routine to add mask over existing plot""" + color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6]) + h, w = mask.shape[-2:] + mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) + ax.imshow(mask_image) diff --git a/qai_hub_models/models/sesr_m5/README.md b/qai_hub_models/models/sesr_m5/README.md new file mode 100644 index 00000000..2cad4d05 --- /dev/null +++ b/qai_hub_models/models/sesr_m5/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [SESR-M5: Upscale images in real time](https://aihub.qualcomm.com/models/sesr_m5) + +SESR M5 performs efficient on-device upscaling of images. + +This is based on the implementation of SESR-M5 found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/sesr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/sesr_m5). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.sesr_m5.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. 
Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.sesr_m5.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of SESR-M5 can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). + + +## References +* [Collapsible Linear Blocks for Super-Efficient Super Resolution](https://arxiv.org/abs/2103.09404) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/sesr) diff --git a/qai_hub_models/models/sesr_m5/__init__.py b/qai_hub_models/models/sesr_m5/__init__.py new file mode 100644 index 00000000..9dfb18f6 --- /dev/null +++ b/qai_hub_models/models/sesr_m5/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import SESR_M5 as Model # noqa: F401 diff --git a/qai_hub_models/models/sesr_m5/demo.py b/qai_hub_models/models/sesr_m5/demo.py new file mode 100644 index 00000000..066c14b6 --- /dev/null +++ b/qai_hub_models/models/sesr_m5/demo.py @@ -0,0 +1,21 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.sesr_m5.model import MODEL_ASSET_VERSION, MODEL_ID, SESR_M5 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "sesr_demo.jpg" +) + + +# Run 
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export the SESR-M5 model for on-device use through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/sesr_m5`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        If Qualcomm AI Hub cannot be accessed, the result of
        `export_without_hub_access` is returned instead.

    Raises:
        AssertionError: If a summary is requested and the profile or inference
            job does not finish successfully.
    """
    model_name = "sesr_m5"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "sesr_m5",
            "SESR-M5",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the asset is always saved with a ".tflite" suffix, even when
        # dst_runtime is not TFLITE - confirm this is intended.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # wait() is called as a plain statement: wrapped in `assert` it would be
        # removed under `python -O` and the job would never be awaited.
        profile_status = profile_job.wait()
        if not profile_status.success:
            # AssertionError is kept so the failure type matches the previous behavior.
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        inference_status = inference_job.wait()
        if not inference_status.success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)
class SESR_M5(BaseModel):
    """SESR M5 super-resolution network, wrapped end-to-end for export."""

    def __init__(
        self,
        sesr_model: torch.nn.Module,
    ) -> None:
        """Store the already-constructed SESR network as `self.model`."""
        super().__init__()
        self.model = sesr_model

    @classmethod
    def from_pretrained(cls) -> SESR_M5:
        """Build the SESR source model and load the `SESR_WEIGHTS` checkpoint on CPU."""
        network = _load_sesr_source_model(
            MODEL_ID, MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS
        )

        # Fetch the checkpoint file from the asset store.
        weights_asset = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, SESR_WEIGHTS
        )
        weights_path = weights_asset.fetch()

        cpu = torch.device("cpu")
        state = torch.load(weights_path, map_location=cpu)["state_dict"]
        network.load_state_dict(state)
        network.eval()
        return cls(network)

    def get_evaluator(self) -> BaseEvaluator:
        """Return the evaluator used for this model's outputs."""
        evaluator = SuperResolutionOutputEvaluator()
        return evaluator

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """
        Upscale `image` with SESR M5.

        Parameters:
            image: Pixel values pre-processed for model consumption.
                   Range: float[0, 1]
                   3-channel Color Space: RGB

        Returns:
            image: Pixel values
                   Range: float[0, 1]
                   3-channel Color Space: RGB
        """
        upscaled = self.model(image)
        return upscaled

    @staticmethod
    def get_input_spec(
        batch_size: int = 1,
        num_channels: int = 3,
        height: int = 128,
        width: int = 128,
    ) -> InputSpec:
        """
        Return the input specification as ordered (name -> (shape, type)) pairs.

        This can be used with the qai_hub python API to declare
        the model input specification upon submitting a profile job.
        """
        image_shape = (batch_size, num_channels, height, width)
        return {"image": (image_shape, "float32")}
@skip_clone_repo_check
def test_task():
    """Upscale the demo image and compare it with the stored reference output."""
    image = load_image(IMAGE_ADDRESS)
    model = SESR_M5.from_pretrained()
    app = SuperResolutionApp(model=model)
    output_img = app.upscale_image(image)[0]

    # The result is compared in memory only. The previous version also saved it
    # to a hard-coded, machine-specific absolute path, which fails wherever that
    # directory does not exist.
    expected_output_image = load_image(OUTPUT_IMAGE_ADDRESS)
    assert_most_same(
        np.asarray(expected_output_image, dtype=np.float32),
        np.array(output_img).astype(np.float32),
        diff_tol=0.01,
    )


def test_demo():
    """Run the CLI demo entry point in test mode."""
    demo_main(is_test=True)
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/sesr_m5_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.sesr_m5_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.sesr_m5_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of SESR-M5-Quantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf).
def main(is_test: bool = False):
    """
    Launch the shared super-resolution demo with the quantized SESR-M5 model.

    Parameters:
        is_test: Forwarded unchanged to `super_resolution_demo`.
    """
    demo_kwargs = {"default_image": IMAGE_ADDRESS, "is_test": is_test}
    super_resolution_demo(SESR_M5Quantizable, **demo_kwargs)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export the SESR-M5-Quantized model for on-device use through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a source model for hub
           (via `model.convert_to_hub_source_model`).
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/sesr_m5_quantized`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        If Qualcomm AI Hub cannot be accessed, the result of
        `export_without_hub_access` is returned instead.

    Raises:
        AssertionError: If a summary is requested and the profile or inference
            job does not finish successfully.
    """
    model_name = "sesr_m5_quantized"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "sesr_m5_quantized",
            "SESR-M5-Quantized",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = model.convert_to_hub_source_model(
        target_runtime, output_path, input_spec
    )
    # Calibration data is only requested for non-TFLite targets.
    if target_runtime == TargetRuntime.TFLITE:
        quant_calibration_data = None
    else:
        quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        calibration_data=quant_calibration_data,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        hub_inputs = sample_inputs
        if target_runtime == TargetRuntime.QNN:
            hub_inputs = get_qnn_inputs(compile_job, sample_inputs)
        # Convert inputs from channel first to channel last. Transpose `hub_inputs`
        # (not `sample_inputs`) so the QNN-specific inputs built above are not
        # silently discarded.
        # NOTE(review): assumes get_qnn_inputs keeps the "image" input name - confirm.
        hub_inputs = transpose_channel_first_to_last(
            "image", hub_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the asset is always saved with a ".tflite" suffix, even when
        # dst_runtime is not TFLITE - confirm this is intended.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # wait() is called as a plain statement: wrapped in `assert` it would be
        # removed under `python -O` and the job would never be awaited.
        profile_status = profile_job.wait()
        if not profile_status.success:
            # AssertionError is kept so the failure type matches the previous behavior.
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        inference_status = inference_job.wait()
        if not inference_status.success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)
+tags: [quantized] +research_paper: https://arxiv.org/abs/2103.09404 +research_paper_title: Collapsible Linear Blocks for Super-Efficient Super Resolution +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/sesr +technical_details: + Number of parameters: 32.1K + Model size: 46.1 KB + Model checkpoint: sesr_m5_4x_checkpoint_int8 + Input resolution: 128x128 +applicable_scenarios: +- Virtual Real Estate Tours +- Gaming +- ARVR +form_factors: +- Phone +- Tablet +related_models: [xlsr, xlsr_quantized, quicksrnetlarge] +has_static_banner: yes +has_animated_banner: yes +license_type: other +dataset: [] diff --git a/qai_hub_models/models/sesr_m5_quantized/model.py b/qai_hub_models/models/sesr_m5_quantized/model.py new file mode 100644 index 00000000..5dd14bf3 --- /dev/null +++ b/qai_hub_models/models/sesr_m5_quantized/model.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models._shared.sesr.common import _load_sesr_source_model +from qai_hub_models.models.sesr_m5.model import ( + NUM_CHANNELS, + NUM_LBLOCKS, + SCALING_FACTOR, + SESR_M5, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +# This verifies aimet is installed, and this must be included first. 
class SESR_M5Quantizable(AIMETQuantizableMixin, SESR_M5):
    """SESR-M5 with post train quantization support.

    Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints.
    Support for quantizing using your own weights & data will come at a later date."""

    def __init__(
        self,
        sesr_model: QuantizationSimModel,
    ) -> None:
        # Both bases are initialized explicitly: SESR_M5 receives the torch module
        # held by the simulator, the mixin receives the simulator itself.
        SESR_M5.__init__(self, sesr_model.model)
        AIMETQuantizableMixin.__init__(
            self, sesr_model, needs_onnx_direct_aimet_export=False
        )

    @classmethod
    def from_pretrained(
        cls,
        aimet_encodings: str | None = "DEFAULT",
    ) -> SESR_M5Quantizable:
        """
        Build the quantization-simulated SESR-M5 model from stored assets.

        Parameters:
            aimet_encodings: Encodings to load into the simulator.
                "DEFAULT" fetches the `AIMET_ENCODINGS` asset from the asset store;
                any other non-empty value is passed to `load_encodings_to_sim` as given;
                None (or an empty string) skips loading encodings.

        Returns:
            A SESR_M5Quantizable wrapping the QuantizationSimModel, with its
            torch model set to eval mode.
        """
        # Load Model
        sesr = _load_sesr_source_model(
            MODEL_ID, MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS
        )
        # Shape of the "image" entry of the default input spec.
        input_shape = SESR_M5.get_input_spec()["image"][0]
        equalize_model(sesr, input_shape)

        # Download weights and quantization parameters
        weights = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS
        ).fetch()
        aimet_config = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG
        ).fetch()

        # Load the model weights and quantization parameters
        state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"]
        # Here we collapse before loading the quantized weights.
        # The model is collapsed pre-quantization - see
        # https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/common/super_resolution/models.py#L110
        sesr.collapse()
        sesr.load_state_dict(state_dict)
        sim = QuantizationSimModel(
            sesr,
            quant_scheme="tf_enhanced",
            default_param_bw=8,
            default_output_bw=8,
            config_file=aimet_config,
            dummy_input=torch.rand(input_shape),
        )
        if aimet_encodings:
            # The "DEFAULT" sentinel is replaced by the path of the fetched asset.
            if aimet_encodings == "DEFAULT":
                aimet_encodings = CachedWebModelAsset.from_asset_store(
                    MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS
                ).fetch()
            load_encodings_to_sim(sim, aimet_encodings)

        sim.model.eval()

        return cls(sim)
def _assert_matches_reference(app: SuperResolutionApp) -> None:
    """Upscale the demo image with `app` and compare it to the stored reference image."""
    image = load_image(IMAGE_ADDRESS)
    app_output_image = app.predict(image)[0]

    output_image = load_image(OUTPUT_IMAGE_ADDRESS)
    # Both images are divided by 255 before the comparison.
    assert_most_close(
        np.asarray(app_output_image, dtype=np.float32) / 255,
        np.asarray(output_image, dtype=np.float32) / 255,
        diff_tol=0.005,
        rtol=0.02,
        atol=0.2,
    )


@skip_clone_repo_check
def test_task():
    """Check the quantization-simulated model against the reference output."""
    # AIMET Quantization Simulator introduces randomness. Eliminate that for this test.
    torch.manual_seed(0)
    model = SESR_M5Quantizable.from_pretrained()
    _assert_matches_reference(SuperResolutionApp(model=model))


@skip_clone_repo_check
def test_trace():
    """Check the quantized TorchScript conversion against the reference output."""
    app = SuperResolutionApp(
        SESR_M5Quantizable.from_pretrained().convert_to_quantized_torchscript()
    )
    _assert_matches_reference(app)


@skip_clone_repo_check
def test_aimet_export():
    """Check that the ONNX + AIMET encodings export produces the expected zip layout."""
    model = SESR_M5Quantizable.from_pretrained()
    name = model.__class__.__name__
    with tempfile.TemporaryDirectory() as tmpdir:
        output_zip = model.convert_to_onnx_and_aimet_encodings(
            tmpdir,
            model.get_input_spec(),
        )
        assert os.path.exists(output_zip)
        # `archive` rather than `zip`, so the builtin is not shadowed.
        with zipfile.ZipFile(output_zip, "r") as archive:
            members = archive.namelist()
            assert f"{name}.aimet/" in members
            assert f"{name}.aimet/{name}.encodings" in members
            assert f"{name}.aimet/{name}.onnx" in members
            assert len(archive.filelist) == 3

    # No test of torchscript and aimet encodings due to #8954


@skip_clone_repo_check
def test_demo():
    """Run the CLI demo entry point in test mode."""
    demo_main(is_test=True)
backbone](https://aihub.qualcomm.com/models/shufflenet_v2) + +ShufflenetV2 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of Shufflenet-v2 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/shufflenet_v2). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.shufflenet_v2.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.shufflenet_v2.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Shufflenet-v2 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE).
def main(is_test: bool = False):
    """Run the shared Imagenet classifier demo using the ShufflenetV2 model class.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo` as its second argument.
    """
    model_cls = ShufflenetV2
    imagenet_demo(model_cls, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/shufflenet_v2`.
        dst_runtime: Which on-device runtime to target. Default is "TFLITE".
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as `options` to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is not accessible, whatever `export_without_hub_access`
        returns (annotated here as a list of strings). Otherwise a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
    """
    model_name = "shufflenet_v2"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access no jobs can be submitted; delegate to the offline helper.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "shufflenet_v2",
            "Shufflenet-v2",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flag for `image_tensor` is always appended to the
    # caller-supplied compile options.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    # NOTE(review): the file is always named `<model_name>.tflite`, whatever
    # `dst_runtime` was requested — confirm this is intended for non-TFLite targets.
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # `profile_job` is not None here: it is only left as None when skip_profiling is set.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    # `sample_inputs` and `inference_job` are bound here: both are assigned in
    # step 4 under the same `not skip_inferencing` condition.
    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class ShufflenetV2(ImagenetClassifier):
    """Imagenet classifier backed by torchvision's `shufflenet_v2_x0_5` network."""

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier:
        """Build the classifier around a torchvision ShuffleNet V2 (x0.5) network.

        Parameters:
            weights: Passed through as the `weights` argument of
                `torchvision.models.shufflenet_v2_x0_5`.

        Returns:
            An instance of this class wrapping the constructed network.
        """
        return cls(tv_models.shufflenet_v2_x0_5(weights=weights))
def test_task():
    """Run the shared Imagenet classifier check on the pretrained ShufflenetV2."""
    classifier = ShufflenetV2.from_pretrained()
    run_imagenet_classifier_test(classifier, MODEL_ID)
demo_main(is_test=True) diff --git a/qai_hub_models/models/shufflenet_v2_quantized/README.md b/qai_hub_models/models/shufflenet_v2_quantized/README.md new file mode 100644 index 00000000..0eeda872 --- /dev/null +++ b/qai_hub_models/models/shufflenet_v2_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Shufflenet-v2Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/shufflenet_v2_quantized) + +ShufflenetV2 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of Shufflenet-v2Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/shufflenet_v2_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.shufflenet_v2_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. 
def main(is_test: bool = False):
    """Run the shared Imagenet classifier demo using the quantizable ShufflenetV2.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo` as its second argument.
    """
    model_cls = ShufflenetV2Quantizable
    imagenet_demo(model_cls, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a hub source model.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/shufflenet_v2_quantized`.
        dst_runtime: Which on-device runtime to target. Default is "TFLITE".
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed as `options` to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When Qualcomm AI Hub is not accessible, whatever `export_without_hub_access`
        returns (annotated here as a list of strings). Otherwise a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
    """
    model_name = "shufflenet_v2_quantized"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access no jobs can be submitted; delegate to the offline helper.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "shufflenet_v2_quantized",
            "Shufflenet-v2Quantized",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Convert the model to the source format expected by hub for this runtime
    source_model = model.convert_to_hub_source_model(
        target_runtime, output_path, input_spec
    )
    # Calibration data is only requested for non-TFLite targets.
    if target_runtime == TargetRuntime.TFLITE:
        quant_calibration_data = None
    else:
        quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

    # 2. Compile the model to an on-device asset
    # The channel-last flag for `image_tensor` is always appended to the
    # caller-supplied compile options.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        calibration_data=quant_calibration_data,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last. This runs on the
        # original sample inputs because it looks the tensor up by its source
        # name, "image_tensor".
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        # Apply the QNN input adaptation last. Previously it ran first and its
        # result was overwritten by the transpose above, so it had no effect.
        # NOTE(review): presumably `get_qnn_inputs` re-keys inputs to the names
        # of the compiled QNN model — confirm against qnn_helpers.
        if target_runtime == TargetRuntime.QNN:
            hub_inputs = get_qnn_inputs(compile_job, hub_inputs)
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    # NOTE(review): the file is always named `<model_name>.tflite`, whatever
    # `dst_runtime` was requested — confirm this is intended for QNN targets.
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # `profile_job` is not None here: it is only left as None when skip_profiling is set.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    # `sample_inputs` and `inference_job` are bound here: both are assigned in
    # step 4 under the same `not skip_inferencing` condition.
    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class ShufflenetV2Quantizable(AIMETQuantizableMixin, ShufflenetV2):
    """ShufflenetV2 with post train quantization support.

    Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints.
    Support for quantizing using your own weights & data will come at a later date."""

    def __init__(
        self,
        sim_model: QuantizationSimModel,
    ) -> None:
        """Wrap an AIMET quantization simulation.

        Parameters:
            sim_model: The quantization sim. Its `.model` is handed to the
                ShufflenetV2 initializer, and the sim itself to the AIMET mixin.
        """
        # Both bases are initialized explicitly because they take different arguments.
        ShufflenetV2.__init__(self, sim_model.model)
        AIMETQuantizableMixin.__init__(
            self, sim_model, needs_onnx_direct_aimet_export=True
        )

    @classmethod
    def from_pretrained(
        cls,
        aimet_encodings: str | None = "DEFAULT",
    ) -> "ShufflenetV2Quantizable":
        """
        Build a quantization-simulated ShufflenetV2 and optionally load encodings.

        Parameters:
            aimet_encodings:
                if "DEFAULT": Fetches the default encodings file from the asset
                    store and loads it.
                elif None (or any other falsy value): Doesn't load any encodings.
                    Used when computing encodings.
                else: Passed as-is to `load_encodings_to_sim` (a filepath).

        Returns:
            An instance of this class wrapping the sim, with `sim.model` in eval mode.
        """
        fp_model = ShufflenetV2.from_pretrained()
        shape = fp_model.get_input_spec()["image_tensor"][0]

        # Cross-layer equalization is applied before the sim is constructed.
        equalize_model(fp_model, shape)

        aimet_config = get_per_channel_aimet_config()
        dummy_input = torch.rand(shape)
        quant_sim = QuantizationSimModel(
            fp_model.net,
            quant_scheme="tf_enhanced",
            default_param_bw=8,
            default_output_bw=8,
            config_file=aimet_config,
            dummy_input=dummy_input,
        )

        encodings_path = aimet_encodings
        if encodings_path == "DEFAULT":
            encodings_path = CachedWebModelAsset.from_asset_store(
                MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS
            ).fetch()
        if encodings_path:
            load_encodings_to_sim(quant_sim, encodings_path)

        quant_sim.model.eval()
        return cls(quant_sim)
def test_task():
    """Run the shared Imagenet classifier check on the quantizable ShufflenetV2."""
    # Tolerances passed to the shared comparison helper.
    tolerances = {"diff_tol": 0.005, "rtol": 0.02, "atol": 0.2}
    classifier = ShufflenetV2Quantizable.from_pretrained()
    run_imagenet_classifier_test(
        classifier,
        MODEL_ID,
        asset_version=MODEL_ASSET_VERSION,
        **tolerances,
    )
run_imagenet_classifier_trace_test( + ShufflenetV2Quantizable.from_pretrained(), + diff_tol=0.01, + rtol=0.02, + atol=0.2, + is_quantized=True, + ) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/sinet/README.md b/qai_hub_models/models/sinet/README.md new file mode 100644 index 00000000..660348f9 --- /dev/null +++ b/qai_hub_models/models/sinet/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [SINet: Lightweight portrait segmentation for background removal](https://aihub.qualcomm.com/models/sinet) + +SINet is a machine learning model that is designed to segment people from close-up portrait images in real time. + +This is based on the implementation of SINet found +[here](https://github.com/clovaai/ext_portrait_segmentation). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/sinet). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.sinet.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.sinet.export +``` +Additional options are documented with the `--help` option. 
def preprocess_image(image: Image) -> torch.Tensor:
    """
    Preprocesses images to be run through SINet
    as prescribed here:
    https://github.com/clovaai/ext_portrait_segmentation/blob/9bc1bada1cb7bd17a3a80a2964980f4b4befef5b/etc/Visualize_webCam.py#L100C1-L109C53

    Parameters:
        image: Input image to be run through the segmentation model. It must
            convert to an HxWx3 array (i.e. a 3-channel image).

    Returns:
        img_tensor: float32 torch tensor 1x3xHxW to be directly passed to the model.
    """
    # These mean and std values were computed using the prescribed training data
    # and process in https://github.com/clovaai/ext_portrait_segmentation/blob/9bc1bada1cb7bd17a3a80a2964980f4b4befef5b/data/loadData.py#L44
    mean = [113.05697, 120.847824, 133.786]
    std = [65.05263, 65.393776, 67.238205]
    img_array = np.array(image)
    img = img_array.astype(np.float32)
    # Per-channel normalization is applied on the raw 0-255 pixel scale.
    img -= np.array(mean).reshape(1, 1, 3)
    img /= np.array(std).reshape(1, 1, 3)

    # The division by 255 happens *after* normalization; presumably this mirrors
    # the linked reference preprocessing -- verify before reordering.
    img /= 255
    img = img.transpose((2, 0, 1))  # HWC -> CHW
    img_tensor = torch.from_numpy(img)
    img_tensor = torch.unsqueeze(img_tensor, 0)  # add a batch dimension

    return img_tensor


class SINetApp:
    """
    This class consists of light-weight "app code" that is required to
    perform end to end inference with SINet.

    For a given image input, the app will:
        * Pre-process the image (normalize)
        * Run image segmentation
        * Convert the raw output into per-pixel masks (sign of the first
          output channel, and argmax across the output channels)
        * Optionally render a tinted visualization of the two regions
    """

    def __init__(self, model: Callable[[torch.Tensor], OrderedDict]):
        self.model = model

    def predict(
        self, image: Image, raw_output: bool = False, show_face: bool = True
    ) -> Image | Tuple[np.ndarray, np.ndarray]:
        """
        From the provided image, segment the image

        Parameters:
            image: A PIL Image in RGB format (the model's default input spec is 224x224).
            raw_output: if True, output returned is the raw class predictions per pixel
            show_face: currently unused by this implementation; kept so existing
                callers that pass it keep working.

        Returns:
            If raw_output is true, returns a tuple (face_map, bg_map):
                face_map: HxW boolean array, True where the first output channel is > 0
                bg_map: HxW uint8 array holding the argmax over the output channels

            Otherwise, returns:
                out_image: a single PIL Image in which the two regions are tinted
                differently.
        """

        input_tensor = preprocess_image(image)
        with torch.no_grad():
            output = self.model(input_tensor)

        # output[0] drops the batch dimension, leaving (classes, H, W).
        face_map = (output[0].data.cpu() > 0).numpy()[0]
        bg_map = output[0].max(0)[1].byte().data.cpu().numpy()

        if raw_output:
            return face_map, bg_map

        idx_fg = face_map == 1
        idx_bg = bg_map == 1

        # PIL reports size as (width, height) while getdata() yields pixels in
        # row-major order, so the array must be reshaped to (height, width, 3).
        # (The previous (size[0], size[1]) order only worked for square images.)
        width, height = image.size
        img_orig = np.array(image.getdata()).reshape(height, width, 3)

        # Tint the two regions differently. `seg_img` keeps img_orig's integer
        # dtype, so the weighted sums are truncated on assignment.
        # NOTE(review): the idx_bg term reads channel 0 for all three output
        # channels; this may be intentional (the reference test image was
        # produced with it), so it is left unchanged -- confirm.
        seg_img = 0 * img_orig
        seg_img[:, :, 0] = (
            img_orig[:, :, 0] * idx_fg * 0.9 + img_orig[:, :, 0] * idx_bg * 0.1
        )
        seg_img[:, :, 1] = (
            img_orig[:, :, 1] * idx_fg * 0.4 + img_orig[:, :, 0] * idx_bg * 0.6
        )
        seg_img[:, :, 2] = (
            img_orig[:, :, 2] * idx_fg * 0.4 + img_orig[:, :, 0] * idx_bg * 0.6
        )
        out_image = PIL.Image.fromarray(seg_img.astype(np.uint8))

        return out_image
def main(is_test: bool = False):
    """
    Run the SINet demo end to end: parse CLI arguments, load the model and the
    input image, segment it, and display or save the result.

    Parameters:
        is_test: if True, parse an empty argument list instead of the process
            arguments and skip displaying/saving the output image.
    """
    # Build the CLI: model options + on-device demo options + the image option.
    cli = get_on_device_demo_parser(get_model_cli_parser(SINet), add_output_dir=True)
    cli.add_argument(
        "--image",
        type=str,
        default=INPUT_IMAGE_ADDRESS,
        help="image file path or URL.",
    )
    argv = [] if is_test else None
    args = cli.parse_args(argv)
    model = demo_model_from_cli_args(SINet, args)
    validate_on_device_demo_args(args, SINet.get_model_id())

    # Load the image, force 3-channel RGB, and run the app on it.
    rgb_image = load_image(args.image).convert("RGB")
    segmented = SINetApp(model).predict(rgb_image, False, False)
    if is_test:
        return
    display_or_save_image(segmented, args.output_dir, "sinet_demo_output.png")


if __name__ == "__main__":
    main()
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/<model_name>`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead (annotated as List[str]).
    """
    model_name = "sinet"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access nothing below can be submitted; delegate and return early.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "sinet",
            "SINet",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The channel-last flags name the model's input ("image") and output
    # ("output_0"); steps 4 and 6 transpose the same tensors to match.
    model_compile_options = model.get_hub_compile_options(
        target_runtime,
        compile_options
        + " --force_channel_last_input image"
        + " --force_channel_last_output output_0",
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    # NOTE(review): the file name always ends in ".tflite", independent of
    # `target_runtime` -- confirm this is intended for non-TFLite runtimes.
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): job success is checked with `assert`, which is stripped
    # when Python runs with -O.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    # `sample_inputs` is only defined when inferencing was not skipped, which
    # is exactly the condition guarding this branch.
    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    # CLI entry point: parse the export arguments and forward them as keyword
    # arguments to export_model. All warnings are silenced for the run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
SINET_SOURCE_REPOSITORY = "https://github.com/clovaai/ext_portrait_segmentation"
SINET_SOURCE_REPO_COMMIT = "9bc1bada1cb7bd17a3a80a2964980f4b4befef5b"
MODEL_ID = __name__.split(".")[-2]
MODEL_ASSET_VERSION = 1
DEFAULT_WEIGHTS = "SINet.pth"
NUM_CLASSES = 2


class SINet(BaseModel):
    """Exportable SINet portrait segmentation application, end-to-end."""

    def __init__(
        self,
        sinet_model: torch.nn.Module,
    ) -> None:
        super().__init__()
        self.model = sinet_model

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> SINet:
        """
        Build the model from a weights name or a local weights file path.

        Parameters:
            weights: Name of a known weights file (only DEFAULT_WEIGHTS is
                supported) or a path to a local weights file.

        Returns:
            The wrapped source model, switched to eval mode.
        """
        sinet_model = _load_sinet_source_model_from_weights(weights)

        return cls(sinet_model.eval())

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """
        Run SINet on `image`, and produce a tensor of classes for segmentation

        Parameters:
            image: Pixel values pre-processed for model consumption.
                   3-channel Color Space: RGB
                   NOTE(review): this was documented as range float[0, 1];
                   confirm against the preprocessing actually applied by the
                   caller before relying on a value range.

        Returns:
            tensor: 1x2xHxW tensor of class logits per pixel
        """
        return self.model(image)

    def get_input_spec(
        self,
        batch_size: int = 1,
        num_channels: int = 3,
        height: int = 224,
        width: int = 224,
    ) -> InputSpec:
        # Get the input specification ordered (name -> (shape, type)) pairs for this model.
        #
        # This can be used with the qai_hub python API to declare
        # the model input specification upon submitting a profile job.
        return {"image": ((batch_size, num_channels, height, width), "float32")}


def _get_weightsfile_from_name(weights_name: str = DEFAULT_WEIGHTS):
    """
    Convert from names of weights files to the url for the weights file

    Raises:
        NotImplementedError: if `weights_name` is not a known weights name.
    """
    if weights_name == DEFAULT_WEIGHTS:
        return CachedWebModelAsset(
            "https://github.com/clovaai/ext_portrait_segmentation/raw/master/result/SINet/SINet.pth",
            MODEL_ID,
            MODEL_ASSET_VERSION,
            "SINet.pth",
        )
    else:
        raise NotImplementedError(f"Cannot get weights file from name {weights_name}")


def _load_sinet_source_model_from_weights(
    weights_name_or_path: str,
) -> torch.nn.Module:
    """
    Load the SINet architecture from the source repository and populate it
    with the requested weights.

    Parameters:
        weights_name_or_path: Either a path to a local weights file (a leading
            "~" is expanded) or the name of a known weights file.

    Returns:
        The source-repository SINet module with the state dict loaded.

    Raises:
        NotImplementedError: if the argument is neither an existing path nor a
            known weights name.
    """
    with SourceAsRoot(
        SINET_SOURCE_REPOSITORY, SINET_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION
    ):
        # Resolve the weights: an existing local file wins (expanded path
        # first, then the path exactly as given); otherwise treat the argument
        # as the name of a downloadable weights file.
        expanded_path = os.path.expanduser(weights_name_or_path)
        if os.path.exists(expanded_path):
            weights_source = expanded_path
        elif os.path.exists(weights_name_or_path):
            weights_source = weights_name_or_path
        else:
            weights_source = _get_weightsfile_from_name(weights_name_or_path)
        weights = load_torch(weights_source)

        # Perform a find and replace for .data.size() in SINet's shuffle implementation
        # as tracing treats this as a constant, but does not treat .shape as a constant
        with open("models/SINet.py", "r", encoding="utf-8") as file:
            file_content = file.read()
        new_content = file_content.replace(".data.size()", ".shape")
        # Only rewrite the file when the patch actually changes it, so repeated
        # loads do not keep touching an already-patched source file.
        if new_content != file_content:
            with open("models/SINet.py", "w", encoding="utf-8") as file:
                file.write(new_content)

        # import the model arch (aliased so it does not shadow the wrapper
        # class defined in this module)
        from models.SINet import SINet as SourceSINet

        # This config is copied from the main function in Sinet.py:
        # https://github.com/clovaai/ext_portrait_segmentation/blob/9bc1bada1cb7bd17a3a80a2964980f4b4befef5b/models/SINet.py#L557
        config = [
            [[3, 1], [5, 1]],
            [[3, 1], [3, 1]],
            [[3, 1], [5, 1]],
            [[3, 1], [3, 1]],
            [[5, 1], [3, 2]],
            [[5, 2], [3, 4]],
            [[3, 1], [3, 1]],
            [[5, 1], [5, 1]],
            [[3, 2], [3, 4]],
            [[3, 1], [5, 2]],
        ]

        sinet_model = SourceSINet(
            classes=NUM_CLASSES, p=2, q=8, config=config, chnn=1
        )
        sinet_model.load_state_dict(weights, strict=True)

        return sinet_model
OUTPUT_IMAGE_LOCAL_PATH = "sinet_demo_output.png"
OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store(
    MODEL_ID, MODEL_ASSET_VERSION, OUTPUT_IMAGE_LOCAL_PATH
)


@skip_clone_repo_check
def test_task():
    """Compare the app's segmented demo image against the stored reference image."""
    input_image = load_image(INPUT_IMAGE_ADDRESS)
    expected_image = load_image(OUTPUT_IMAGE_ADDRESS)
    predicted_image = SINetApp(SINet.from_pretrained()).predict(input_image, False)

    # Both images are compared on a 0-1 scale.
    actual = np.asarray(predicted_image, dtype=np.float32) / 255
    desired = np.asarray(expected_image, dtype=np.float32) / 255
    np.testing.assert_allclose(actual, desired, rtol=0.02, atol=0.2)


def test_demo():
    """Run the demo entry point in test mode."""
    demo_main(is_test=True)
This can be run as follows: + +```bash +python -m qai_hub_models.models.squeezenet1_1.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of SqueezeNet-1_1 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size](https://arxiv.org/abs/1602.07360) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py) diff --git a/qai_hub_models/models/squeezenet1_1/__init__.py b/qai_hub_models/models/squeezenet1_1/__init__.py new file mode 100644 index 00000000..aba29d7a --- /dev/null +++ b/qai_hub_models/models/squeezenet1_1/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import SqueezeNet as Model # noqa: F401 diff --git a/qai_hub_models/models/squeezenet1_1/demo.py b/qai_hub_models/models/squeezenet1_1/demo.py new file mode 100644 index 00000000..d90387f9 --- /dev/null +++ b/qai_hub_models/models/squeezenet1_1/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.squeezenet1_1.model import SqueezeNet + + +def main(is_test: bool = False): + imagenet_demo(SqueezeNet, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/squeezenet1_1/export.py b/qai_hub_models/models/squeezenet1_1/export.py new file mode 100644 index 00000000..b672df92 --- /dev/null +++ b/qai_hub_models/models/squeezenet1_1/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. 
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/<model_name>`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub is not accessible, the result of
        `export_without_hub_access` is returned instead (annotated as List[str]).
    """
    model_name = "squeezenet1_1"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    # Without hub access nothing below can be submitted; delegate and return early.
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "squeezenet1_1",
            "SqueezeNet-1_1",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # Only the input ("image_tensor") is forced to channel-last; step 4
    # transposes the sample inputs to match.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    # NOTE(review): the file name always ends in ".tflite", independent of
    # `target_runtime` -- confirm this is intended for non-TFLite runtimes.
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # NOTE(review): job success is checked with `assert`, which is stripped
    # when Python runs with -O.
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    # `sample_inputs` is only defined when inferencing was not skipped, which
    # is exactly the condition guarding this branch.
    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    # CLI entry point: parse the export arguments and forward them as keyword
    # arguments to export_model. All warnings are silenced for the run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model)
    args = parser.parse_args()
    export_model(**vars(args))


if __name__ == "__main__":
    main()
MODEL_ID = "squeezenet1_1"
DEFAULT_WEIGHTS = "IMAGENET1K_V1"


class SqueezeNet(ImagenetClassifier):
    """Imagenet classifier wrapper around torchvision's `squeezenet1_1`."""

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier:
        """
        Build the classifier from torchvision weights.

        Parameters:
            weights: Weights identifier forwarded to `tv_models.squeezenet1_1`.

        Returns:
            An instance of this class wrapping the torchvision network.
        """
        return cls(tv_models.squeezenet1_1(weights=weights))
def test_task():
    """Run the shared Imagenet classifier test against the pretrained model."""
    classifier = SqueezeNet.from_pretrained()
    run_imagenet_classifier_test(classifier, MODEL_ID)


def test_trace():
    """Run the shared Imagenet classifier trace test against the pretrained model."""
    classifier = SqueezeNet.from_pretrained()
    run_imagenet_classifier_trace_test(classifier)


def test_demo():
    """Verify the demo does not crash."""
    demo_main(is_test=True)
a/qai_hub_models/models/squeezenet1_1_quantized/README.md b/qai_hub_models/models/squeezenet1_1_quantized/README.md new file mode 100644 index 00000000..4829dd66 --- /dev/null +++ b/qai_hub_models/models/squeezenet1_1_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [SqueezeNet-1_1Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/squeezenet1_1_quantized) + +SqueezeNet is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of SqueezeNet-1_1Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/squeezenet1_1_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.squeezenet1_1_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.squeezenet1_1_quantized.export +``` +Additional options are documented with the `--help` option. 
def main(is_test: bool = False) -> None:
    """
    Run the shared ImageNet classifier demo with the quantizable SqueezeNet.

    Parameters:
        is_test: Forwarded unchanged to `imagenet_demo` together with the
            `SqueezeNetQuantizable` class.
    """
    imagenet_demo(SqueezeNetQuantizable, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<cwd>/build/<model_name>`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same options are also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        If Qualcomm AI Hub cannot be accessed, the result of
        `export_without_hub_access` is returned instead.

    Raises:
        AssertionError: If a summary is requested and the profile or inference
            job does not finish successfully.
    """
    # NOTE(review): the containing file is marked as auto-generated; mirror any
    # change made here in the generator template as well.
    model_name = "squeezenet1_1_quantized"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            model_name,
            "SqueezeNet-1_1Quantized",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = model.convert_to_hub_source_model(
        target_runtime, output_path, input_spec
    )
    # Calibration data is only requested for non-TFLite targets.
    if target_runtime == TargetRuntime.TFLITE:
        quant_calibration_data = None
    else:
        quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        calibration_data=quant_calibration_data,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last. This is done on the
        # source-named sample inputs first so that the QNN conversion below is
        # applied to the transposed data instead of being overwritten by it.
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        if target_runtime == TargetRuntime.QNN:
            hub_inputs = get_qnn_inputs(compile_job, hub_inputs)
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options,
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # NOTE(review): the ".tflite" suffix is used regardless of
        # target_runtime; confirm this is intended for QNN exports.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    # The job checks below are explicit (rather than `assert` statements) so the
    # blocking wait() and the success check still run under `python -O`.
    if not skip_summary and not skip_profiling:
        if not profile_job.wait().success:
            raise AssertionError(f"Profile job for {model_name} did not succeed.")
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        if not inference_job.wait().success:
            raise AssertionError(f"Inference job for {model_name} did not succeed.")
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)
class SqueezeNetQuantizable(AIMETQuantizableMixin, SqueezeNet):
    """SqueezeNet wrapped in an AIMET quantization simulation.

    Only 8 bit weights and activations are supported, and only pre-quantized
    checkpoints can be loaded. Quantizing with your own weights & data is
    planned for a later date."""

    def __init__(self, sim_model: QuantizationSimModel) -> None:
        # The classifier wraps the network held by the simulation; the mixin
        # keeps the simulation itself for export.
        SqueezeNet.__init__(self, sim_model.model)
        AIMETQuantizableMixin.__init__(
            self, sim_model, needs_onnx_direct_aimet_export=True
        )

    @classmethod
    def from_pretrained(
        cls, aimet_encodings: str | None = "DEFAULT"
    ) -> "SqueezeNetQuantizable":
        """
        Create the quantizable model from the pretrained floating point SqueezeNet.

        Parameters:
            aimet_encodings:
                "DEFAULT" loads the aimet encodings calibrated on imagenette.
                None skips loading encodings (used when computing encodings).
                Any other value is treated as a filepath to stored encodings.
        """
        float_model = SqueezeNet.from_pretrained()
        input_shape = float_model.get_input_spec()["image_tensor"][0]

        # Cross-layer equalization runs before the simulation is created.
        equalize_model(float_model, input_shape)

        aimet_config = get_per_channel_aimet_config()
        example_input = torch.rand(input_shape)
        quant_sim = QuantizationSimModel(
            float_model.net,
            quant_scheme="tf_enhanced",
            default_param_bw=8,
            default_output_bw=8,
            config_file=aimet_config,
            dummy_input=example_input,
        )

        if aimet_encodings:
            encodings_path = (
                CachedWebModelAsset.from_asset_store(
                    MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS
                ).fetch()
                if aimet_encodings == "DEFAULT"
                else aimet_encodings
            )
            load_encodings_to_sim(quant_sim, encodings_path)

        quant_sim.model.eval()
        return cls(quant_sim)
def test_task():
    """Run the shared ImageNet classifier test against the quantized SqueezeNet."""
    quantized_model = SqueezeNetQuantizable.from_pretrained()
    run_imagenet_classifier_test(
        quantized_model,
        MODEL_ID,
        asset_version=MODEL_ASSET_VERSION,
        diff_tol=0.005,
        rtol=0.02,
        atol=0.2,
    )
class StableDiffusionApp:
    """
    StableDiffusionApp represents the application code needed to string
    together the various neural networks that make up the Stable Diffusion
    algorithm. This code is written in Python and uses PyTorch and is meant to
    serve as a reference implementation for application in other languages and
    for other platforms.

    Please run the app via `demo.py`.

    References
    ----------
    * https://arxiv.org/abs/2112.10752
    * https://github.com/apple/ml-stable-diffusion
    """

    def __init__(
        self,
        text_encoder: SDTextEncoder,
        vae_decoder: SDVAEDecoder,
        unet: SDUNet,
        tokenizer: transformers.SpecialTokensMixin,
        scheduler: diffusers.SchedulerMixin,
    ):
        """
        Store the networks and helpers used during image generation.

        Parameters
        ----------
        text_encoder : SDTextEncoder
            Network that embeds tokenized prompts.
        vae_decoder : SDVAEDecoder
            Network that decodes latents into an image.
        unet : SDUNet
            Denoising network that predicts the noise residual.
        tokenizer : transformers.SpecialTokensMixin
            Tokenizer used to turn prompts into input ids.
        scheduler : diffusers.SchedulerMixin
            Scheduler that provides the timesteps and the denoising step.
        """
        self.text_encoder = text_encoder
        self.vae_decoder = vae_decoder
        self.unet = unet
        self.tokenizer = tokenizer
        self.scheduler = scheduler

    def _tokenize(self, prompt: str):
        """Tokenize a prompt, padded and truncated to the tokenizer's max length."""
        return self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )

    def _encode_text_prompt(
        self, prompt: str, do_classifier_free_guidance: bool = False
    ) -> torch.Tensor:
        """
        Takes a text prompt and returns a tensor with its text embedding.

        Parameters
        ----------
        prompt : str
            The text prompt to encode.
        do_classifier_free_guidance : bool
            Whether to use classifier-free guidance. If True, the returned text
            embedding will be a batch of two, with the unconditional and the
            conditional embeddings.

        Returns
        -------
        torch.Tensor
            The text embedding, transposed to (batch_size, embedding_size, 1,
            sequence_length).
        """
        # Tokenize. The prompt is truncated exactly like the unconditional
        # prompt below so both always have the same sequence length.
        text_input = self._tokenize(prompt)

        # Embed using the text encoder neural network
        text_embeddings, _ = self.text_encoder(text_input.input_ids)

        if do_classifier_free_guidance:
            # Unconditional prompt is simply an empty string
            uncond_input = self._tokenize("")

            # Embed using the text encoder neural network
            uncond_embeddings, _ = self.text_encoder(uncond_input.input_ids)

            # The text embeddings becomes a batch of two
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings], dim=0)

        # Transpose to (batch_size, embedding_size, 1, sequence_length)
        text_embeddings = text_embeddings.permute(0, 2, 1).unsqueeze(2)

        return text_embeddings

    def predict(self, *args, **kwargs):
        """Alias of `generate_image`; all arguments are forwarded unchanged."""
        return self.generate_image(*args, **kwargs)

    def generate_image(
        self,
        prompt: str,
        num_steps: int = 50,
        seed: int = 0,
        guidance_scale: float = 7.5,
    ) -> torch.Tensor:
        """
        Generate an image using the PyTorch reference neural networks. This
        code can be used as a reference for how to glue together the neural
        networks in an application. Note that this code relies on a tokenizer
        and scheduler from the HuggingFace's diffusers library, so those would
        have to be ported to the application as well.

        Parameters
        ----------
        prompt : str
            The text prompt to generate an image from.
        num_steps : int
            The number of steps to run the diffusion process for. Higher value
            may lead to better image quality.
        seed : int
            The seed to use for the random number generator.
        guidance_scale : float
            Classifier-free guidance is a method that allows us to control how
            strongly the image generation is guided by the prompt. This is done
            by always processing two samples at once: an unconditional (using a
            text embedding of an empty prompt) and a conditional (using a text
            embedding of the provided prompt). Given the noise prediction of
            both of these, we linearly interpolate between them based on the
            guidance_scale. A guidance scale of 0 is the same as using an empty
            prompt. A guidance scale of 1 turns off classifier-free guidance
            and is computationally less expensive since it only processes one
            sample at a time. Intuitively you may think the rest of guidance
            scales are between 0 and 1, but it is common to use a scale greater
            than 1 as a method of amplifying the prompt's influence on the
            image, pushing it further away from the unconditional sample.

        Returns
        -------
        torch.Tensor
            The generated image in RGB scaled in [0, 1] with tensor shape (H,
            W, 3). The height and the width may depend on the underlying Stable
            Diffusion version, but is typically 512x512.
        """

        # Determine if need dual samples
        do_classifier_free_guidance = guidance_scale != 1.0
        # Number of samples pushed through the UNet per step: unconditional +
        # conditional with classifier-free guidance, otherwise a single one.
        batch_size = 2 if do_classifier_free_guidance else 1

        # Encode text prompt
        text_embeddings = self._encode_text_prompt(
            prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
        )

        # Set up time steps
        self.scheduler.set_timesteps(num_steps)
        timesteps = self.scheduler.timesteps

        # Randomly generate initial noise (latents) based on random seed
        # We generate the random in numpy and not torch to be consistent with
        # the reference implementation.
        num_channels_latents = self.unet.in_channels
        latents_shape = (1, num_channels_latents, self.unet.height, self.unet.width)
        rng = np.random.RandomState(seed)
        latents = rng.normal(
            scale=self.scheduler.init_noise_sigma, size=latents_shape
        ).astype(np.float32)
        latents = torch.from_numpy(latents)

        # Set up progress bar
        tqdm_context = tqdm(
            timesteps,
            total=len(timesteps),
            desc="Generating image",
            colour="magenta",
        )

        # Main denoising loop
        for t in tqdm_context:
            # For classifier free guidance, make a copy of the latent vector
            latent_model_input = torch.tile(latents, (batch_size, 1, 1, 1))

            # Scale the latent vector based on the current timestep
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # Predict the noise residual using the UNet denoiser. One timestep
            # entry per sample keeps the timestep batch aligned with the
            # latent and text-embedding batches when guidance is disabled.
            (noise_pred,) = self.unet(
                latent_model_input,
                torch.tensor([t] * batch_size, dtype=torch.float32),
                text_embeddings,
            )

            # If using classifier-free guidance, interpolate between the
            # unconditional and conditional samples based on the guidance scale
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = torch.split(noise_pred, 1, dim=0)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            # Denoise the latents based on the noise prediction
            latents = self.scheduler.step(
                noise_pred,
                t,
                latents,
            ).prev_sample

            gc.collect()

        # Rescale latents and decode into RGB image
        latents *= 1 / 0.18215
        image = self.vae_decoder(latents)

        # Rescale image to [0, 1] and permute to (height, width, 3)
        image = torch.clip(image / 2.0 + 0.5, 0, 1)
        image = image.squeeze(0).permute(1, 2, 0)
        return image
def main(is_test: bool = False):
    """
    Run Stable Diffusion end-to-end on a given prompt.

    The demo generates an image from the prompt and, unless `is_test` is set,
    prints the run configuration and displays or saves the result. When
    `is_test` is set, command-line arguments are ignored and defaults are used.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--prompt",
        default=DEFAULT_DEMO_PROMPT,
        help="Prompt to generate image from.",
    )
    arg_parser.add_argument(
        "--model-version",
        default=DEFAULT_VERSION,
        help="Pre-trained checkpoint and configuration. For available checkpoints: https://huggingface.co/models?search=stable-diffusion.",
    )
    arg_parser.add_argument(
        "--num-steps",
        default=50,
        type=int,
        help="The number of diffusion iteration steps (higher means better quality).",
    )
    arg_parser.add_argument(
        "--seed",
        default=0,
        type=int,
        help="Random seed.",
    )
    add_output_dir_arg(arg_parser)
    arg_parser.add_argument(
        "--guidance-scale",
        type=float,
        default=7.5,
        help="Strength of guidance (higher means more influence from prompt).",
    )
    # An empty argv is parsed in test mode so only defaults are used.
    cli_args = arg_parser.parse_args([] if is_test else None)

    # Load the pipeline with weights from HuggingFace
    pipeline = StableDiffusionPipeline.from_pretrained(
        cli_args.model_version, use_auth_token=True
    )

    # Wrap the pipeline's networks (in eval mode) and reuse its tokenizer and
    # scheduler to assemble the application.
    app = StableDiffusionApp(
        text_encoder=SDTextEncoder(pipeline).eval(),
        vae_decoder=SDVAEDecoder(pipeline).eval(),
        unet=SDUNet(pipeline).eval(),
        tokenizer=pipeline.tokenizer,
        scheduler=pipeline.scheduler,
    )

    if not is_test:
        summary_lines = [
            "",
            "** Performing image generation with Stable Diffusion **",
            "",
            f"Prompt: {cli_args.prompt}",
            f"Model: {cli_args.model_version}",
            f"Number of steps: {cli_args.num_steps}",
            f"Guidance scale: {cli_args.guidance_scale}",
            f"Seed: {cli_args.seed}",
            "",
            "Note: This reference demo uses significant amounts of memory and may take a few minutes to run.",
            "",
        ]
        print("\n".join(summary_lines))

    # Generate image
    image = app.generate_image(
        cli_args.prompt,
        num_steps=cli_args.num_steps,
        seed=cli_args.seed,
        guidance_scale=cli_args.guidance_scale,
    )

    # Convert the [0, 1] float image into 8-bit pixels for PIL.
    pixels = np.round(image.detach().numpy() * 255).astype(np.uint8)
    pil_img = Image.fromarray(pixels)

    if not is_test:
        display_or_save_image(pil_img, cli_args.output_dir)
Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). 
+ * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "stable_diffusion" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component in components: + if component not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "stable_diffusion", + "no_info_yaml_found", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict = {} + if "StableDiffusionTextEncoder" in components: + components_dict["StableDiffusionTextEncoder"] = model.text_encoder + if "StableDiffusionVAEDecoder" in components: + components_dict["StableDiffusionVAEDecoder"] = model.vae_decoder + if "StableDiffusionUNet" in components: + components_dict["StableDiffusionUNet"] = model.unet + if "StableDiffusonVAEEncoder" in components: + components_dict["StableDiffusonVAEEncoder"] = model.vae_encoder + + compile_jobs = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {component_name} to run on-device.") + compile_jobs[component_name] = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{component_name}", + options=model_compile_options, + ) + + # 3. 
Profile the model assets on real devices + profile_jobs = {} + if not skip_profiling: + for component_name in components: + print(f"Profiling model {component_name} on a hosted device.") + profile_jobs[component_name] = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_jobs = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + sample_inputs = components_dict[component_name].sample_inputs() + inference_jobs[component_name] = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 5. Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model = compile_job.get_target_model() + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion/model.py b/qai_hub_models/models/stable_diffusion/model.py new file mode 100644 index 00000000..e4827952 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion/model.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +from typing import Any, Optional + +from diffusers import StableDiffusionPipeline + +from qai_hub_models.utils.asset_loaders import SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +SD_SOURCE_REPO = "https://github.com/apple/ml-stable-diffusion.git" +SD_SOURCE_REPO_COMMIT = "b392a0aca09a8321c8955ee84b48e9e9fdb49c93" +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_VERSION = 
"CompVis/stable-diffusion-v1-4" + + +class SDTextEncoder(BaseModel): + """ + Model that encodes the text prompt into a latent tensor. + """ + + def __init__(self, pipe): + super().__init__() + self.text_encoder_sequence_length = pipe.tokenizer.model_max_length + self.vocab_size = pipe.tokenizer.vocab_size + self.text_encoder = pipe.text_encoder + + def get_input_spec(self) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for + # this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + + return {"input_ids": ((1, self.text_encoder_sequence_length), "int32")} + + def forward(self, input_ids): + return self.text_encoder(input_ids, return_dict=False) + + @classmethod + def from_pretrained(cls): + return SDModel.from_pretrained().text_encoder + + +class SDVAEEncoder(BaseModel): + """ + Model that encodes the image into the low-resolution latent space (the + domain of the UNet denoiser). This is not needed for the basic demo which + generates only guided by a text prompt. + """ + + def __init__(self, pipe): + super().__init__() + self.quant_conv = pipe.vae.quant_conv + self.encoder = pipe.vae.encoder + self.vae_scale = 8 + self.height = pipe.unet.config.sample_size * self.vae_scale + self.width = pipe.unet.config.sample_size * self.vae_scale + + def get_input_spec(self) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for + # this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. 
+ + return {"image": ((1, 3, self.height, self.width), "float32")} + + def forward(self, z): + return self.quant_conv(self.encoder(z)) + + @classmethod + def from_pretrained(cls): + return SDModel.from_pretrained().vae_encoder + + +class SDVAEDecoder(BaseModel): + """ + Model that decodes the image from the low-resolution latent space (the + domain of the UNet denoiser). + """ + + def __init__(self, pipe): + super().__init__() + self.post_quant_conv = pipe.vae.post_quant_conv + self.decoder = pipe.vae.decoder + self.latent_channels = pipe.vae.config.latent_channels + self.height = pipe.unet.config.sample_size + self.width = pipe.unet.config.sample_size + + def get_input_spec(self) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for + # this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + + return {"z": ((1, self.latent_channels, self.height, self.width), "float32")} + + def forward(self, z): + return self.decoder(self.post_quant_conv(z)) + + @classmethod + def from_pretrained(cls): + return SDModel.from_pretrained().vae_decoder + + +class SDUNet(BaseModel): + """ + UNet is the core of the Stable Diffusion denoiser. It is a U-Net-style + denoiser that operates on a lower-resolution embedded space. It is the only + model that runs repeatedly during the generation of an image, so + performance of this model is the most critical. + + Unlike the other models, this model does not use the HuggingFace model + directly and instead uses a version developed by Apple from + https://github.com/apple/ml-stable-diffusion. 
+ """ + + def __init__(self, pipe, do_classifier_free_guidance: bool = True): + super().__init__() + + # Load unet package + unet = _load_apple_sd_package() + + # Construct UNet and load state dictionary + self.unet = unet.UNet2DConditionModel(**pipe.unet.config) + self.unet.load_state_dict(pipe.unet.state_dict()) + + # Configuration variables + self.batch_size = 2 if do_classifier_free_guidance else 1 + self.in_channels = pipe.unet.config.in_channels + self.height = pipe.unet.config.sample_size + self.width = pipe.unet.config.sample_size + + # Input shapes + self.sample_shape = ( + self.batch_size, + self.in_channels, + self.height, + self.width, + ) + self.timestep_shape = (self.batch_size,) + self.encoder_hidden_states_shape = ( + self.batch_size, + pipe.text_encoder.config.hidden_size, + 1, + pipe.text_encoder.config.max_position_embeddings, + ) + + def get_input_spec(self) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for + # this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + + return { + "sample": (self.sample_shape, "float32"), + "timestep": (self.timestep_shape, "float32"), + "encoder_hidden_states": (self.encoder_hidden_states_shape, "float32"), + } + + def forward(self, *args): + return self.unet(*args) + + @classmethod + def from_pretrained(cls): + return SDModel.from_pretrained().unet + + +def _load_apple_sd_package() -> Any: + """ + Imports and returns the Apple the Stable Diffusion package. + + Returns: + unet: The package where the UNet model is defined. 
+ """ + with SourceAsRoot( + SD_SOURCE_REPO, SD_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + ): + # import required modules and utilities + from python_coreml_stable_diffusion import unet + + return unet + + +class SDModel(CollectionModel): + """Wrapper class containing all the components to run stable diffusion.""" + + def __init__( + self, + text_encoder: SDTextEncoder, + vae_decoder: SDVAEDecoder, + unet: SDUNet, + vae_encoder: Optional[SDVAEEncoder] = None, + ): + self.text_encoder = text_encoder + self.vae_decoder = vae_decoder + self.unet = unet + self.vae_encoder = vae_encoder + + @classmethod + def from_pretrained(cls, model_version: str = DEFAULT_VERSION): + pipe = StableDiffusionPipeline.from_pretrained( + model_version, use_auth_token=True + ) + return cls( + text_encoder=SDTextEncoder(pipe).eval(), + vae_decoder=SDVAEDecoder(pipe).eval(), + unet=SDUNet(pipe).eval(), + vae_encoder=SDVAEEncoder(pipe).eval(), + ) diff --git a/qai_hub_models/models/stable_diffusion/requirements.txt b/qai_hub_models/models/stable_diffusion/requirements.txt new file mode 100644 index 00000000..3f91a5d8 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion/requirements.txt @@ -0,0 +1,3 @@ +transformers==4.31.0 +coremltools==6.2 +diffusers[torch]==0.21.4 diff --git a/qai_hub_models/models/stable_diffusion/test.py b/qai_hub_models/models/stable_diffusion/test.py new file mode 100644 index 00000000..922908dd --- /dev/null +++ b/qai_hub_models/models/stable_diffusion/test.py @@ -0,0 +1,66 @@ +import numpy as np +import pytest +from diffusers import StableDiffusionPipeline + +from qai_hub_models.models.stable_diffusion.app import StableDiffusionApp +from qai_hub_models.models.stable_diffusion.demo import DEFAULT_DEMO_PROMPT +from qai_hub_models.models.stable_diffusion.demo import main as demo_main +from qai_hub_models.models.stable_diffusion.model import ( + DEFAULT_VERSION, + MODEL_ASSET_VERSION, + MODEL_ID, + SDTextEncoder, + SDUNet, + SDVAEDecoder, +) +from 
qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, + MODEL_ASSET_VERSION, + "CompVis-v1-4/a-high-quality-photo-of-a-surfing-dog-pytorch-seed42-steps2.png", +) + + +@pytest.mark.skip(reason="Uses a large amount of memory and is often killed by OOM.") +@skip_clone_repo_check +def test_e2e_numerical(): + """ + Verify our PyTorch driver produces the correct image. + """ + # Not sufficient for a sensible image, but enough for a test. + num_steps = 2 + seed = 42 + + pipe = StableDiffusionPipeline.from_pretrained(DEFAULT_VERSION, use_auth_token=True) + + # Construct all the networks + text_encoder = SDTextEncoder(pipe).eval() + vae_decoder = SDVAEDecoder(pipe).eval() + unet = SDUNet(pipe).eval() + + # Save the tokenizer and scheduler + tokenizer = pipe.tokenizer + scheduler = pipe.scheduler + + app = StableDiffusionApp( + text_encoder=text_encoder, + vae_decoder=vae_decoder, + unet=unet, + tokenizer=tokenizer, + scheduler=scheduler, + ) + + ref_image_pil = load_image(IMAGE_ADDRESS) + ref_image_np = np.array(ref_image_pil).astype(np.float32) / 255.0 + + image = app.generate_image(DEFAULT_DEMO_PROMPT, num_steps=num_steps, seed=seed) + + assert np.allclose(image.detach().numpy(), ref_image_np, atol=1 / 255)  # reference PNG is 8-bit, so allow one quantization step + + +@pytest.mark.skip(reason="Uses a large amount of memory and is often killed by OOM.") +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/stable_diffusion/test_generated.py b/qai_hub_models/models/stable_diffusion/test_generated.py new file mode 100644 index 00000000..7dfe0d0c --- /dev/null +++ b/qai_hub_models/models/stable_diffusion/test_generated.py @@ -0,0 +1,121 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ +import os +from unittest import mock + +import pytest +import qai_hub as hub +import yaml + +from qai_hub_models.models.stable_diffusion.export import export_model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.mark.skip(reason="Consumes too much RAM and crashes CI machine.") +@pytest.mark.compile +@skip_clone_repo_check +def test_compile_tflite(): + results = export_model( + skip_downloading=True, + skip_profiling=True, + skip_inferencing=True, + dst_runtime="TFLITE", + ) + for component_name, result in results.items(): + compile_job = result[0] + if os.environ.get("TEST_HUB_ASYNC", 0): + with open(os.environ["COMPILE_JOBS_FILE"], "a") as f: + f.write( + f"stable_diffusion_TFLITE_{component_name}: {compile_job.job_id}\n" + ) + else: + result = compile_job.wait() + assert result.success + + +@pytest.mark.skip( + reason="Compilation fails https://dev.aihub.qualcomm.com/jobs/j7gjqldv5 (VAE decoder) and https://dev.aihub.qualcomm.com/jobs/jz5w49wmg (UNet)" +) +@pytest.mark.compile +@skip_clone_repo_check +def test_compile_qnn(): + results = export_model( + skip_downloading=True, + skip_profiling=True, + skip_inferencing=True, + dst_runtime="QNN", + ) + for component_name, result in results.items(): + compile_job = result[0] + if os.environ.get("TEST_HUB_ASYNC", 0): + with open(os.environ["COMPILE_JOBS_FILE"], "a") as f: + f.write( + f"stable_diffusion_QNN_{component_name}: {compile_job.job_id}\n" + ) + else: + result = compile_job.wait() + assert result.success + + +@pytest.mark.skip(reason="Consumes too much RAM and crashes CI machine.") +@pytest.mark.profile +@skip_clone_repo_check +def test_profile_tflite(): + if os.environ.get("TEST_HUB_ASYNC", 0): + with open(os.environ["COMPILE_JOBS_FILE"], "r") as f: + job_ids = yaml.safe_load(f.read()) + job_list = [] + for i in job_ids.keys(): + if i.startswith("stable_diffusion_TFLITE"): + job_list.append(hub.get_job(job_ids[i])) + hub.submit_compile_job = mock.Mock(side_effect=job_list) + 
results = export_model( + skip_downloading=True, + skip_profiling=False, + skip_inferencing=True, + skip_summary=True, + dst_runtime="TFLITE", + ) + for component_name, result in results.items(): + profile_job = result[1] + if os.environ.get("TEST_HUB_ASYNC", 0): + with open(os.environ["PROFILE_JOBS_FILE"], "a") as f: + f.write( + f"stable_diffusion_TFLITE_{component_name}: {profile_job.job_id}\n" + ) + else: + result = profile_job.wait() + assert result.success + + +@pytest.mark.skip( + reason="Compilation fails https://dev.aihub.qualcomm.com/jobs/j7gjqldv5 (VAE decoder) and https://dev.aihub.qualcomm.com/jobs/jz5w49wmg (UNet)" +) +@pytest.mark.profile +@skip_clone_repo_check +def test_profile_qnn(): + if os.environ.get("TEST_HUB_ASYNC", 0): + with open(os.environ["COMPILE_JOBS_FILE"], "r") as f: + job_ids = yaml.safe_load(f.read()) + job_list = [] + for i in job_ids.keys(): + if i.startswith("stable_diffusion_QNN"): + job_list.append(hub.get_job(job_ids[i])) + hub.submit_compile_job = mock.Mock(side_effect=job_list) + results = export_model( + skip_downloading=True, + skip_profiling=False, + skip_inferencing=True, + skip_summary=True, + dst_runtime="QNN", + ) + for component_name, result in results.items(): + profile_job = result[1] + if os.environ.get("TEST_HUB_ASYNC", 0): + with open(os.environ["PROFILE_JOBS_FILE"], "a") as f: + f.write( + f"stable_diffusion_QNN_{component_name}: {profile_job.job_id}\n" + ) + else: + result = profile_job.wait() + assert result.success diff --git a/qai_hub_models/models/stable_diffusion_quantized/README.md b/qai_hub_models/models/stable_diffusion_quantized/README.md new file mode 100644 index 00000000..4bd0f303 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_quantized/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Stable-Diffusion: State-of-the-art generative AI model used to generate detailed 
images conditioned on text descriptions](https://aihub.qualcomm.com/models/stable_diffusion_quantized) + +Generates high resolution images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. + +This is based on the implementation of Stable-Diffusion found +[here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/stable_diffusion_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[stable_diffusion_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.stable_diffusion_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.stable_diffusion_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. 
+- The license for the original implementation of Stable-Diffusion can be found + [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). + + +## References +* [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) +* [Source Model Implementation](https://github.com/CompVis/stable-diffusion/tree/main) diff --git a/qai_hub_models/models/stable_diffusion_quantized/__init__.py b/qai_hub_models/models/stable_diffusion_quantized/__init__.py new file mode 100644 index 00000000..9631f75e --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_quantized/__init__.py @@ -0,0 +1,8 @@ +from qai_hub_models.models.stable_diffusion_quantized.model import ( # noqa: F401 + MODEL_ID, +) +from qai_hub_models.models.stable_diffusion_quantized.model import ( # noqa: F401 + StableDiffusionQuantized as Model, +) + +from .app import StableDiffusionApp as App # noqa: F401 diff --git a/qai_hub_models/models/stable_diffusion_quantized/app.py b/qai_hub_models/models/stable_diffusion_quantized/app.py new file mode 100644 index 00000000..2652f096 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_quantized/app.py @@ -0,0 +1,201 @@ +from typing import Any, Callable, Tuple + +import torch +from diffusers.models.embeddings import get_timestep_embedding + +OUT_H, OUT_W = 512, 512 + + +class StableDiffusionApp: + """ + StableDiffusionApp represents the application code needed to string + together the various neural networks that make up the Stable Diffusion + algorithm. This code is written in Python and uses PyTorch and is meant to + serve as a reference implementation for application in other languages and + for other platforms. + + Please run the app via `demo.py`. 
+ + References + ---------- + * https://arxiv.org/abs/2112.10752 + * https://github.com/apple/ml-stable-diffusion + """ + + def __init__( + self, + text_encoder: Callable[..., Tuple[torch.Tensor, ...]], + vae_decoder: Callable[..., Tuple[torch.Tensor, ...]], + unet: Callable[..., Tuple[torch.Tensor, ...]], + tokenizer: Any, + scheduler: Any, + time_embedding: Any, + ): + """ + Initializes StableDiffusionApp with required neural networks for end-to-end pipeline. + + Parameters + ---------- + text_encoder: + Encodes the input text + vae_decoder: + Decoder to decode latent space into output image + unet: + Denoises image in latent space + tokenizer: + Tokenizer for input text. + Output of Tokenizer is fed to text_encoder. + One can experiment with different tokenizers available based on Clip-ViT. + scheduler: + Solver for diffusion steps. + Updates latent space during each iteration. + time_embedding: + Projects time-step into embedding used during denoising in latent space. + """ + + self.text_encoder = text_encoder + self.vae_decoder = vae_decoder + self.unet = unet + self.tokenizer = tokenizer + self.scheduler = scheduler + self.time_embedding = time_embedding + + def get_time_embedding(self, timestep): + timestep = torch.tensor([timestep]) + t_emb = get_timestep_embedding(timestep, 320, True, 0) + emb = self.time_embedding(t_emb) + + return emb + + def _encode_text_prompt(self, prompt: str) -> torch.Tensor: + """ + Takes a text prompt and returns a tensor with its text embedding. + + Parameters + ---------- + prompt: The text prompt to encode. 
+ """ + # Tokenize input prompt + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + + # Tokenize empty prompt + max_length = text_input.input_ids.shape[-1] + uncond_input = self.tokenizer( + [""], + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + + # Embed using the text encoder neural network + # Encode input and empty prompt in one go + print(f"Extracting embeddings (inference on TextEncoder)\n{'-' * 50}") + embeddings = self.text_encoder( + [ + text_input.input_ids.type(torch.int32), + uncond_input.input_ids.type(torch.int32), + ] + ) + cond_embeddings, uncond_embeddings = torch.split(embeddings, 1, 0) + return cond_embeddings, uncond_embeddings + + def predict(self, *args, **kwargs): + # See generate_image. + return self.generate_image(*args, **kwargs) + + def generate_image( + self, + prompt: str, + num_steps: int = 50, + seed: int = 0, + guidance_scale: float = 7.5, + ) -> torch.Tensor: + """ + Generate an image using the PyTorch reference neural networks. This + code can be used as a reference for how to glue together the neural + networks in an application. Note that this code relies on a tokenizer + and scheduler from the HuggingFace's diffusers library, so those would + have to be ported to the application as well. + + Parameters + ---------- + prompt: + The text prompt to generate an image from. + num_steps: + The number of steps to run the diffusion process for. Higher value + may lead to better image quality. + seed: + The seed to use for the random number generator. + guidance_scale: + Classifier-free guidance is a method that allows us to control how + strongly the image generation is guided by the prompt. This is done + by always processing two samples at once: an unconditional (using a + text embedding of an empty prompt) and a conditional (using a text + embedding of the provided prompt). 
Given the noise prediction of + both of these, we linearly interpolate between them based on the + guidance_scale. A guidance scale of 0 is the same as using an empty + prompt. A guidance scale of 1 turns off classifier-free guidance + and is computationally less expensive since it only processes one + sample at a time. Intuitively you may think the rest of guidance + scales are between 0 and 1, but it is common to use a scale greater + than 1 as a method of amplifying the prompt's influence on the + image, pushing it further away from the unconditional sample. + + Returns + ------- + torch.Tensor + The generated image in RGB scaled in [0, 1] with tensor shape (H, + W, 3). The height and the width may depend on the underlying Stable + Diffusion version, but is typically 512x512. + """ + + # Encode text prompt + cond_embeddings, uncond_embeddings = self._encode_text_prompt(prompt) + self.scheduler.set_timesteps(num_steps) + self.scheduler.config.prediction_type = "epsilon" + + # Channel last input + latents_shape = (1, 4, OUT_H // 8, OUT_W // 8) + + generator = torch.manual_seed(seed) + latents = torch.randn(latents_shape, generator=generator) + + latents = latents * self.scheduler.init_noise_sigma + + # Helper method to go back and forth from channel-first to channel-last + def _make_channel_last_torch(input_tensor): + return torch.permute(input_tensor, [0, 2, 3, 1]) + + def _make_channel_first_torch(input_tensor): + return torch.permute(torch.Tensor(input_tensor), [0, 3, 1, 2]) + + for i, t in enumerate(self.scheduler.timesteps): + print(f"Step: {i + 1}\n{'-' * 20}") + time_emb = self.get_time_embedding(t) + latent_model_input = self.scheduler.scale_model_input(latents, t) + latent_model_input = _make_channel_last_torch(latent_model_input) + + print(f"Denoising image in latent space (inference on UNet)\n{'-' * 50}") + # Denoise image in latent space + noise = self.unet( + [latent_model_input, latent_model_input], + [time_emb, time_emb], + [cond_embeddings, 
uncond_embeddings], + ) + + noise_cond, noise_uncond = torch.split(noise, 1, 0) + noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond) + + noise_pred = _make_channel_first_torch(noise_pred) + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + print(f"Decoding generated image (inference on VAEDecoder)\n{'-' * 50}") + # Decode generated image from latent space + latents_vae = _make_channel_last_torch(latents) + image = self.vae_decoder(latents_vae) + return image diff --git a/qai_hub_models/models/stable_diffusion_quantized/demo.py b/qai_hub_models/models/stable_diffusion_quantized/demo.py new file mode 100644 index 00000000..c9b0e1fa --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_quantized/demo.py @@ -0,0 +1,145 @@ +import argparse + +import numpy as np +import qai_hub as hub +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from PIL import Image +from transformers import CLIPTokenizer + +from qai_hub_models.models.stable_diffusion_quantized.app import StableDiffusionApp +from qai_hub_models.models.stable_diffusion_quantized.model import ( + ClipVITTextEncoder, + Unet, + VAEDecoder, +) +from qai_hub_models.utils.args import add_output_dir_arg +from qai_hub_models.utils.base_model import BasePrecompiledModel +from qai_hub_models.utils.display import display_or_save_image +from qai_hub_models.utils.inference import HubModel +from qai_hub_models.utils.qai_hub_helpers import can_access_qualcomm_ai_hub + +DEFAULT_DEMO_PROMPT = "spectacular view of northern lights from Alaska" +DEFAULT_DEVICE_NAME = "Samsung Galaxy S23 Ultra" + + +def _get_hub_model(input_model: BasePrecompiledModel, device_name=DEFAULT_DEVICE_NAME): + if not can_access_qualcomm_ai_hub(): + raise RuntimeError( + "Stable-diffusion on-device demo requires access to QAI-Hub.\n" + "Please visit https://aihub.qualcomm.com/ and sign-up." 
+ ) + # Upload model + uploaded_model = hub.upload_model(input_model.get_target_model_path()) + inputs = list(input_model.get_input_spec().keys()) + return HubModel(uploaded_model, inputs, hub.Device(name=device_name)) + + +# Run Stable Diffusion end-to-end on a given prompt. The demo will output an +# AI-generated image based on the description in the prompt. +def main(is_test: bool = False): + parser = argparse.ArgumentParser() + parser.add_argument( + "--prompt", + default=DEFAULT_DEMO_PROMPT, + help="Prompt to generate image from.", + ) + parser.add_argument( + "--num-steps", + default=5, + type=int, + help="The number of diffusion iteration steps (higher means better quality).", + ) + parser.add_argument( + "--seed", + default=0, + type=int, + help="Random seed.", + ) + add_output_dir_arg(parser) + parser.add_argument( + "--guidance-scale", + type=float, + default=7.5, + help="Strength of guidance (higher means more influence from prompt).", + ) + parser.add_argument( + "--device-name", + type=str, + default=DEFAULT_DEVICE_NAME, + help="Device to run stable-diffusion demo on.", + ) + args = parser.parse_args([] if is_test else None) + + if not is_test: + print(f"\n{'-' * 100}") + print( + f"** Performing image generation on-device({args.device_name}) with Stable Diffusion **" + ) + print() + print("Prompt:", args.prompt) + print("Number of steps:", args.num_steps) + print("Guidance scale:", args.guidance_scale) + print("Seed:", args.seed) + print() + print( + "Note: This reference demo uses significant amounts of memory and may take a few minutes to run." + ) + print(f"{'-' * 100}\n") + + print(f"Downloading model assets\n{'-' * 50}") + # Load target models + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + + # Create three HubModel instances to prepare for on-device inference. + # This is similar to initializing PyTorch model to call forward method later. 
+ # Instead of forward, we later submit inference_jobs on QAI-Hub for + # on-device evaluation. + print(f"Uploading model assets on QAI-Hub\n{'-' * 50}") + text_encoder = _get_hub_model(text_encoder, args.device_name) + unet = _get_hub_model(unet, args.device_name) + vae_decoder = _get_hub_model(vae_decoder, args.device_name) + + # Create tokenizer, scheduler and time_embedding required + # for stable-diffusion pipeline. + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "runwayml/stable-diffusion-v1-5", subfolder="unet" + ).time_embedding + # Load Application + app = StableDiffusionApp( + text_encoder=text_encoder, + vae_decoder=vae_decoder, + unet=unet, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + ) + + # Generate image + image = app.generate_image( + args.prompt, + num_steps=args.num_steps, + seed=args.seed, + guidance_scale=args.guidance_scale, + ) + + pil_img = Image.fromarray(np.round(image.numpy() * 255).astype(np.uint8)[0]) + + if not is_test: + display_or_save_image(pil_img, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/export.py b/qai_hub_models/models/stable_diffusion_quantized/export.py new file mode 100644 index 00000000..43cf9005 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_quantized/export.py @@ -0,0 +1,166 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
def export_model(
    device: str = "Samsung Galaxy S23",
    components: Optional[List[str]] = None,
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    profile_options: str = "",
    **additional_model_kwargs,
) -> Mapping[str, Tuple[Optional[hub.ProfileJob], Optional[hub.InferenceJob]]] | List[
    str
]:
    """
    Upload, profile and run the precompiled Stable-Diffusion components on hub.

    This function accomplishes 5 main tasks:

        1. Initialize model.
        2. Upload model assets to hub.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Summarizes the results from profiling.

    Each of the last three steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        components: List of sub-components of the model that will be exported.
            Each component is uploaded and profiled separately.
            Every entry must be a member of ALL_COMPONENTS.
            Defaults to DEFAULT_COMPONENTS if not specified (or empty).
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/stable_diffusion_quantized`.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed to the inference job.
        **additional_model_kwargs: Accepted so callers can pass a uniform set of
            export arguments; this function body does not read them.

    Returns:
        A Mapping from component_name to a 2-tuple of:
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When hub is not accessible, the result of `export_without_hub_access`
        is returned instead (annotated as List[str]).

    Raises:
        ValueError: If any requested component is not in ALL_COMPONENTS.
    """
    model_name = "stable_diffusion_quantized"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    # Keep the caller's original value (possibly None) for the no-hub fallback.
    component_arg = components
    components = components or DEFAULT_COMPONENTS
    for component in components:
        if component not in ALL_COMPONENTS:
            raise ValueError(f"Invalid component {component}.")
    if not can_access_qualcomm_ai_hub():
        # No hub credentials: delegate to the offline helper. The positional
        # `False` and `""` fill the helper's skip_downloading / compile_options
        # slots — presumably; verify against export_without_hub_access.
        return export_without_hub_access(
            "stable_diffusion_quantized",
            "Stable-Diffusion",
            device,
            skip_profiling,
            skip_inferencing,
            False,
            skip_summary,
            output_path,
            TargetRuntime.QNN,
            "",
            profile_options,
            component_arg,
        )

    # 1. Initialize model
    print("Initializing model class")
    model = Model.from_precompiled()
    # Map each requested component name to its precompiled sub-model.
    components_dict = {}
    if "text_encoder" in components:
        components_dict["text_encoder"] = model.text_encoder
    if "unet" in components:
        components_dict["unet"] = model.unet
    if "vae_decoder" in components:
        components_dict["vae_decoder"] = model.vae_decoder

    # 2. Upload model assets to hub
    print("Uploading model assets on hub")
    uploaded_models = {}
    for component_name in components:
        uploaded_models[component_name] = hub.upload_model(
            components_dict[component_name].get_target_model_path()
        )

    # 3. Profile the model assets on real devices
    profile_jobs = {}
    if not skip_profiling:
        for component_name in components:
            print(f"Profiling model {component_name} on a hosted device.")
            profile_jobs[component_name] = hub.submit_profile_job(
                model=uploaded_models[component_name],
                device=hub.Device(device),
                name=f"{component_name}",
                options=profile_options,
            )

    # 4. Run inference on-device with sample inputs
    inference_jobs = {}
    if not skip_inferencing:
        for component_name in components:
            print(
                f"Running inference for {component_name} on a hosted device with example inputs."
            )
            sample_inputs = components_dict[component_name].sample_inputs()
            inference_jobs[component_name] = hub.submit_inference_job(
                model=uploaded_models[component_name],
                inputs=sample_inputs,
                device=hub.Device(device),
                name=f"{component_name}",
                options=profile_options,
            )

    # 5. Summarize the results from profiling
    if not skip_summary and not skip_profiling:
        for component_name in components:
            profile_job = profile_jobs[component_name]
            # Blocks until the job finishes; fails loudly if it did not succeed.
            assert profile_job.wait().success
            profile_data = profile_job.download_profile()
            print_profile_metrics_from_job(profile_job, profile_data)

    # Skipped stages left their dicts empty, so .get() yields None for them.
    return {
        component_name: (
            profile_jobs.get(component_name, None),
            inference_jobs.get(component_name, None),
        )
        for component_name in components
    }
class StableDiffusionQuantized:
    """
    Stable Diffusion wrapper class consists of
        - Text Encoder
        - UNet based diffuser
        - VAE decoder

    All three models are pre-trained, quantized (int8 weight, uint16 activations)
    and compiled into serialized binary for Qualcomm Snapdragon Gen2+.
    """

    def __init__(self, text_encoder, unet, vae_decoder) -> None:
        """
        Bundle the three pipeline components.

        Parameters:
            text_encoder: Text encoder component.
            unet: UNet denoiser component.
            vae_decoder: VAE decoder component.
        """
        self.text_encoder = text_encoder
        self.unet = unet
        self.vae_decoder = vae_decoder

    @classmethod
    def from_precompiled(cls) -> "StableDiffusionQuantized":
        """
        Build the wrapper from the precompiled assets of each component.

        Returns:
            An instance of `cls` (so subclasses get an instance of their own
            type) holding the three precompiled components.
        """
        # Instantiate through `cls` rather than the hard-coded class name so
        # that subclasses inherit a correctly-typed alternate constructor.
        return cls(
            text_encoder=ClipVITTextEncoder.from_precompiled(),
            unet=Unet.from_precompiled(),
            vae_decoder=VAEDecoder.from_precompiled(),
        )
class VAEDecoder(BasePrecompiledModel):
    """
    Decodes image from latent into output generated image.

    Pre-trained, quantized (int8 weight, uint16 activations)
    and compiled into serialized binary for Qualcomm Snapdragon Gen2+.
    """

    def __init__(self, target_model_path) -> None:
        """
        Parameters:
            target_model_path: Location of the serialized, precompiled model.
        """
        self.target_model_path = target_model_path

    @classmethod
    def from_precompiled(cls) -> "VAEDecoder":
        """Fetch the precompiled VAE decoder asset and wrap it in an instance of `cls`."""
        model_path = CachedWebModelAsset.from_asset_store(
            MODEL_ID, MODEL_ASSET_VERSION, VAE_DECODER
        ).fetch()
        # Use `cls` (not the hard-coded class name) so subclasses are honoured.
        return cls(model_path)

    def get_target_model_path(self) -> str:
        """Return the path given at construction time."""
        return self.target_model_path

    def get_input_spec(self) -> InputSpec:
        """Return the input specification: a single float32 input of shape (1, 64, 64, 4)."""
        return {"input_1": ((1, 64, 64, 4), "float32")}
layers_on_cpu: 0 + total_layers: 570 + precision: uint16 + primary_compute_unit: NPU + job_id: jo5m87owp + job_status: Passed +- name: VAE-Decoder-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 393878 + throughput: 2.53 + estimated_peak_memory_range: + min: 225280 + max: 11689680 + layer_info: + layers_on_npu: 409 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 409 + precision: uint16 + primary_compute_unit: NPU + job_id: joprwro95 + job_status: Passed +- name: UNet-Quantized + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 256698 + throughput: 3.89 + estimated_peak_memory_range: + min: 143360 + max: 12844792 + layer_info: + layers_on_npu: 5421 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 5421 + precision: uint16 + primary_compute_unit: NPU + job_id: jegnk4org + job_status: Passed +aggregated: + supported_devices: + - Samsung Galaxy S23 Ultra + supported_oses: + - Android + supported_chipsets: + - Snapdragon® 8 Gen 2 + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-14T05:29:28.928297Z' + torchscript_onnx_qnn: + inference_time: 661938 + throughput: 1.51 + estimated_peak_memory_range: + min: 225280 + max: 44039432 + precision: uint16 + primary_compute_unit: NPU + job_id: "" + job_status: Passed diff --git a/qai_hub_models/models/stable_diffusion_quantized/requirements.txt b/qai_hub_models/models/stable_diffusion_quantized/requirements.txt new file mode 
@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.")
@pytest.mark.slow_cloud
def test_export():
    """Export only the text encoder and check which jobs were submitted."""
    # The text encoder is the smallest model in the Stable-Diffusion pipeline,
    # which keeps this cloud test as cheap as possible.
    export_kwargs = dict(
        components=["text_encoder"],
        skip_inferencing=True,
        skip_downloading=True,
        skip_summary=True,
    )
    with tempfile.TemporaryDirectory() as tmpdir:
        exported_jobs = export_model(output_dir=tmpdir, **export_kwargs)

        # NOTE: The jobs are deliberately not awaited, since that would slow
        # CI down. A waiting variant of this test belongs in the nightly run.
        for profile_job, inference_job in exported_jobs.values():
            assert profile_job is not None
            assert inference_job is None
class StyleGAN2App:
    """
    Thin application wrapper around a StyleGAN2 generator callable.

    It builds random latent vectors, converts class indices to one-hot form
    when the model is class-conditional, runs the model and converts the
    output into 8-bit RGB images.
    """

    def __init__(
        self,
        model: Callable[[torch.Tensor, torch.Tensor | None], torch.Tensor]
        | Callable[[torch.Tensor], torch.Tensor],
        output_dims: int = 512,
        num_classes: int = 0,
    ):
        """
        Parameters:
            model: Callable taking (image_noise, classes) when `num_classes`
                is non-zero, or (image_noise) otherwise, and returning an
                image tensor.
            output_dims: Length of the random state vector fed to the model.
            num_classes: Number of classes the model is conditioned on
                (0 means the model takes no class input).
        """
        self.model = model
        self.output_dims = output_dims
        self.num_classes = num_classes

    def generate_random_vec(self, batch_size=1, seed=None) -> torch.Tensor:
        """
        Create a random state vector of shape [batch_size, self.output_dims].

        Parameters:
            batch_size: Number of vectors to generate.
            seed: Seed for the NumPy random state (None means unseeded).
        """
        if isinstance(self.model, StyleGAN2):
            # Let the model produce its own sample so shape/dtype match its spec.
            input_spec = self.model.get_input_spec(batch_size)
            return torch.from_numpy(
                self.model.sample_inputs(input_spec, seed=seed)["image_noise"][0]
            )
        # NOTE(review): randn yields float64 here, whereas the StyleGAN2 path
        # above yields float32 — confirm whether callers rely on this.
        return torch.from_numpy(
            np.random.RandomState(seed).randn(batch_size, self.output_dims)
        )

    def predict(self, *args, **kwargs):
        # See generate_images.
        return self.generate_images(*args, **kwargs)

    def generate_images(
        self,
        image_noise: torch.Tensor | None = None,
        class_idx: torch.Tensor | None = None,
        raw_output: bool = False,
    ) -> torch.Tensor | List[Image.Image]:
        """
        Generate an image.

        Inputs:
            image_noise: torch.Tensor | None
                Random state vector from which images should be generated.
                Shape: [N, self.output_dims]
                If None, a random vector is generated (one row per entry of a
                tensor `class_idx`, otherwise a single row).

            class_idx: int | torch.tensor | None
                Class index[es] to generate. If the model was not trained on more than 1
                class, this is unused.

                If an integer, generate all batches with the class index defined by the integer.

                If a tensor, provide tensor of either shape:
                    [N, self.num_classes].
                        If a value of class_idx[b, n] is 1, that class will be generated.
                        A maximum of 1 class can be set to 1 per batch.
                    [N]
                        Each element is a class index (integer dtype).
                        Generate one batch for each provided class index.

            raw_output:
                If true, returns a uint8 tensor of N generated RGB images in
                channels-last layout, i.e. shape [N, H, W, 3].
                Otherwise, returns List[PIL.Image]

        Returns:
            See raw_output parameter description.
        """
        with torch.no_grad():
            if image_noise is None:
                # Only tensors carry a batch dimension; an int (or None) class
                # index means a single image.
                batch_size = (
                    class_idx.shape[0] if isinstance(class_idx, torch.Tensor) else 1
                )
                image_noise = self.generate_random_vec(batch_size=batch_size)

            if self.num_classes != 0:
                if isinstance(class_idx, int):
                    # Broadcast the single index over the batch as an integer
                    # tensor (torch.Tensor([...]) would produce float32 and
                    # fail the integer check below).
                    class_idx = torch.full(
                        (image_noise.shape[0],), class_idx, dtype=torch.long
                    )

                if isinstance(class_idx, torch.Tensor) and len(class_idx.shape) == 1:
                    # Convert from [N] class index to one-hot [N, # of classes]
                    assert class_idx.dtype in (
                        torch.uint8,
                        torch.int8,
                        torch.int16,
                        torch.int32,
                        torch.int64,
                    ), "class_idx must have an integer dtype"
                    # one_hot only accepts int64 index tensors.
                    model_classes = torch.nn.functional.one_hot(
                        class_idx.long(), self.num_classes
                    )
                else:
                    model_classes = class_idx

                image_tensor = self.model(image_noise, model_classes)
            else:
                image_tensor = self.model(image_noise)

            # Channels-first -> channels-last, then map to 0..255 bytes.
            image_tensor = (
                (image_tensor.permute(0, 2, 3, 1) * 127.5 + 128)
                .clamp(0, 255)
                .to(torch.uint8)
            )

            if raw_output:
                return image_tensor

            return [
                Image.fromarray(single_image.numpy(), "RGB")
                for single_image in image_tensor
            ]
image generation (if applicable).", + ) + args = parser.parse_args([] if is_test else None) + + # Create model and app + model = model_from_cli_args(StyleGAN2, args) + assert isinstance(model, StyleGAN2) + app = StyleGAN2App(model, model.output_size, model.num_classes) + + # Verify model input args + if model.num_classes == 0 and args.classes: + raise ValueError( + "Classes cannot be provided for models trained without classes." + ) + if args.classes and len(args.classes) > 1 and len(args.classes) != args.num_images: + raise ValueError( + "You may provide 1 class for all images, or one class per image." + ) + if not args.classes and model.num_classes: + args.classes = [0] # Default to class 0 + + # Get desired batch size + batch_size = len(args.classes) if args.classes else args.num_images + + # Generate input and run inference + z = app.generate_random_vec(batch_size=batch_size, seed=args.seed) + images = app.generate_images( + z, + class_idx=torch.Tensor(args.classes).type(torch.int) if args.classes else None, + ) + + # Display images + assert isinstance(images, list) + if not is_test: + for (i, image) in enumerate(images): + display_or_save_image(image, args.output_dir, f"image_{i}.png") + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stylegan2/export.py b/qai_hub_models/models/stylegan2/export.py new file mode 100644 index 00000000..1ceb7788 --- /dev/null +++ b/qai_hub_models/models/stylegan2/export.py @@ -0,0 +1,182 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Trace, compile, profile and run StyleGAN2 through hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23" if not specified.
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/stylegan2`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
            ` --force_channel_last_output output_0` is appended to this string.
        profile_options: Additional options to pass when submitting the profile job.
            ` --compute_unit gpu` is appended, and the same string is also
            used for the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        A 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).

        When hub is not accessible, the result of `export_without_hub_access`
        is returned instead (annotated as List[str]).
    """
    model_name = "stylegan2"
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        # No hub credentials: delegate to the offline helper.
        return export_without_hub_access(
            "stylegan2",
            "StyleGAN2",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    # The helpers pick, out of additional_model_kwargs, the entries relevant
    # to each call — presumably by signature; verify in utils.args.
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_output output_0"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options + " --compute_unit gpu",
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # sample_inputs is reused in step 6, which is guarded by the same flag.
        sample_inputs = model.sample_inputs(input_spec)
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=sample_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options + " --compute_unit gpu",
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # The file is always written with a .tflite extension.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        # Blocks until the job finishes; fails loudly if it did not succeed.
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        # Local PyTorch reference output to compare the on-device result with.
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        # Convert outputs from channel last to channel first
        inference_result = transpose_channel_last_to_first(
            "output_0", inference_result, target_runtime
        )
        print_inference_metrics(inference_job, inference_result, torch_out)

    return (compile_job, profile_job, inference_job)
class StyleGAN2(BaseModel):
    """Exportable StyleGAN2 image generator."""

    def __init__(
        self,
        generator: torch.nn.Module,
        noise_mode="const",
    ) -> None:
        """
        Create a StyleGAN2 model

        Parameters:
            generator:
                Generator object loaded from the StyleGAN repository.
            noise_mode:
                Controls noise model introduces into the input.
                Options: 'const', 'random', 'none'
        """
        super().__init__()
        self.generator = generator
        # NOTE: despite its name, `output_size` holds the generator's z_dim,
        # i.e. the length of the random state vector fed to the model.
        self.output_size: int = self.generator.z_dim  # type: ignore
        self.num_classes: int = self.generator.c_dim  # type: ignore
        self.noise_mode = noise_mode
        assert noise_mode in ["const", "random", "none"]

    @staticmethod
    def from_pretrained(model_url_or_path: str = DEFAULT_WEIGHTS):
        """Load StyleGAN2 from a pickled styleGAN2 file."""
        return StyleGAN2(_load_stylegan2_source_model_from_weights(model_url_or_path))

    def forward(self, image_noise: torch.Tensor, classes: torch.Tensor | None = None):
        """
        Generate an image.

        Parameters:
            image_noise: torch.Tensor
                Random state vector from which images should be generated.
                Shape: [ N, self.output_size ]

            classes: torch.tensor | None
                Tensor of shape [N, self.num_classes].
                If a value of class_idx[b, n] is 1, that class will be generated.
                A maximum of 1 class can be set to 1 per batch.
                If None, the first class is selected for every batch entry
                (or an empty [N, 0] tensor is used when the model has no classes).

        Returns:
            The generator output: a tensor of N generated RGB images.
            NOTE(review): presumably channels-first, [N, 3, H, W] — confirm
            against the generator in the source repository.
        """
        if classes is None:
            # Allocate on the same device as the noise so the generator never
            # receives inputs living on two different devices.
            classes = torch.zeros(
                (image_noise.shape[0], self.num_classes), device=image_noise.device
            )
            if self.num_classes != 0:
                classes[:, 0] = 1  # Select first class as default

        return self.generator(
            image_noise,
            classes,
            truncation_psi=1,
            noise_mode=self.noise_mode,
            force_fp32=True,
        )

    def get_input_spec(self, batch_size: int = 1) -> InputSpec:
        """
        Returns the input specification (name -> (shape, type)). This can be
        used to submit a profiling job on Qualcomm AI Hub.

        The "classes" entry is only present for class-conditional models.
        """
        inputs = {"image_noise": ((batch_size, self.output_size), "float32")}
        if self.num_classes != 0:
            inputs["classes"] = ((batch_size, self.num_classes), "float32")
        return inputs  # type: ignore

    def sample_inputs(
        self, input_spec: InputSpec | None = None, seed=None
    ) -> Dict[str, List[np.ndarray]]:
        """
        Build sample inputs matching `input_spec`.

        Parameters:
            input_spec: Specification to follow; defaults to `get_input_spec()`.
            seed: Seed for the NumPy random state used for the noise vector.

        Returns:
            Mapping from input name to a single-element list holding a float32
            array: standard-normal noise for "image_noise" and, if the spec
            has a "classes" entry, a one-hot array selecting class 0.
        """
        if not input_spec:
            input_spec = self.get_input_spec()

        inputs = {
            "image_noise": [
                np.random.RandomState(seed)
                .randn(*input_spec["image_noise"][0])
                .astype(np.float32)
            ]
        }
        if "classes" in input_spec:
            classes = np.zeros(input_spec["classes"][0]).astype(np.float32)
            if input_spec["classes"][0][1] != 0:
                classes[:, 0] = 1  # Select first class as default
            inputs["classes"] = [classes]

        return inputs
+ # x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1]) + x = x.reshape([batch_size * num_channels, in_height, 1, in_width, 1]) + # ===== Local change end ===== + + x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1]) + x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx]) + + # Pad or crop. + x = torch.nn.functional.pad( + x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)] + ) + x = x[ + :, + :, + max(-pady0, 0) : x.shape[2] - max(-pady1, 0), + max(-padx0, 0) : x.shape[3] - max(-padx1, 0), + ] + + # Setup filter. + f = f * (gain ** (f.ndim / 2)) + f = f.to(x.dtype) + if not flip_filter: + f = f.flip(list(range(f.ndim))) + + # Convolve with the filter. + f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim) + if f.ndim == 4: + x = conv2d_gradfix.conv2d(input=x, weight=f, groups=num_channels) + else: + x = conv2d_gradfix.conv2d( + input=x, weight=f.unsqueeze(2), groups=num_channels + ) + x = conv2d_gradfix.conv2d( + input=x, weight=f.unsqueeze(3), groups=num_channels + ) + + # Downsample by throwing away pixels. + x = x[:, :, ::downy, ::downx] + return x + + return _upfirdn2d_ref + + +def _load_stylegan2_source_model_from_weights( + model_url_or_path: str, +) -> torch.nn.Module: + # Load StyleGAN model from the source repository using the given weights. 
+ with SourceAsRoot( + STYLEGAN2_SOURCE_REPOSITORY, + STYLEGAN2_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + # Patch rank 6 tensor that can't be exported + from torch_utils import misc + from torch_utils.ops import conv2d_gradfix, upfirdn2d + + upfirdn2d._upfirdn2d_ref = _get_qaihm_upfirdn2d_ref( + misc, conv2d_gradfix, upfirdn2d + ) + + # Load model + import dnnlib + import legacy + + with dnnlib.util.open_url(model_url_or_path) as f: + # Get generator + return legacy.load_network_pkl(f)["G_ema"] diff --git a/qai_hub_models/models/stylegan2/perf.yaml b/qai_hub_models/models/stylegan2/perf.yaml new file mode 100644 index 00000000..c762bf13 --- /dev/null +++ b/qai_hub_models/models/stylegan2/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: StyleGAN2 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1280066.0 + throughput: 0.7812097188738706 + estimated_peak_memory_range: + min: 1790029824 + max: 2607953504 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 89 + layers_on_cpu: 462 + total_layers: 551 + job_id: jz57elvqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped 
+ reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:31:56.125164Z' diff --git a/qai_hub_models/models/stylegan2/requirements.txt b/qai_hub_models/models/stylegan2/requirements.txt new file mode 100644 index 00000000..b8261cd7 --- /dev/null +++ b/qai_hub_models/models/stylegan2/requirements.txt @@ -0,0 +1 @@ +click>=8.0 diff --git a/qai_hub_models/models/stylegan2/test.py b/qai_hub_models/models/stylegan2/test.py new file mode 100644 index 00000000..7389c557 --- /dev/null +++ b/qai_hub_models/models/stylegan2/test.py @@ -0,0 +1,66 @@ +import numpy as np +import torch + +from qai_hub_models.models.stylegan2.app import StyleGAN2App +from qai_hub_models.models.stylegan2.demo import main as demo_main +from qai_hub_models.models.stylegan2.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + StyleGAN2, + _load_stylegan2_source_model_from_weights, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +SAMPLE_GENERATOR_RANDOM_SEED = 1000 +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "output_sample_image.png" +) + + +@skip_clone_repo_check +def test_task(): + source_model = _load_stylegan2_source_model_from_weights(DEFAULT_WEIGHTS) + qaihm_model = StyleGAN2.from_pretrained(DEFAULT_WEIGHTS) + z = StyleGAN2App(qaihm_model).generate_random_vec(seed=SAMPLE_GENERATOR_RANDOM_SEED) + + with torch.no_grad(): + assert_most_close( + source_model(z, [[]], noise_mode="const", force_fp32=True), + qaihm_model(z), + 0.005, + ) + + +@skip_clone_repo_check +def test_stylegan2_app(): + app = StyleGAN2App(StyleGAN2.from_pretrained()) + + # App generates expected image + z = app.generate_random_vec(seed=SAMPLE_GENERATOR_RANDOM_SEED) + expected = 
np.asarray(load_image(OUTPUT_IMAGE_ADDRESS).convert("RGB")) + output = np.asarray(app.generate_images(z, raw_output=True)) + assert_most_close(output, expected, 0.005) + + # App can generate multiple images + output_images = app.generate_images(class_idx=torch.Tensor([1, 2]).type(torch.int)) + assert len(output_images) == 2 + + +@skip_clone_repo_check +def test_stylegan2_trace(): + app = StyleGAN2App(StyleGAN2.from_pretrained().convert_to_torchscript()) + + # App generates expected image + z = app.generate_random_vec(seed=SAMPLE_GENERATOR_RANDOM_SEED) + expected = np.asarray(load_image(OUTPUT_IMAGE_ADDRESS).convert("RGB")) + output = np.asarray(app.generate_images(z, raw_output=True))[0] + + assert_most_close(output, expected, 0.005) + + +@skip_clone_repo_check +def test_stylegan2_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/swin_base/README.md b/qai_hub_models/models/swin_base/README.md new file mode 100644 index 00000000..6d48fce0 --- /dev/null +++ b/qai_hub_models/models/swin_base/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Swin-Base: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/swin_base) + +SwinBase is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of Swin-Base found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/swin_base). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. 
+ + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.swin_base.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.swin_base.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Swin-Base can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
+ + +## References +* [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py) diff --git a/qai_hub_models/models/swin_base/__init__.py b/qai_hub_models/models/swin_base/__init__.py new file mode 100644 index 00000000..3aee3c02 --- /dev/null +++ b/qai_hub_models/models/swin_base/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import SwinBase as Model # noqa: F401 diff --git a/qai_hub_models/models/swin_base/demo.py b/qai_hub_models/models/swin_base/demo.py new file mode 100644 index 00000000..2624eeb2 --- /dev/null +++ b/qai_hub_models/models/swin_base/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.swin_base.model import SwinBase + + +def main(is_test: bool = False): + imagenet_demo(SwinBase, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/swin_base/export.py b/qai_hub_models/models/swin_base/export.py new file mode 100644 index 00000000..94b75161 --- /dev/null +++ b/qai_hub_models/models/swin_base/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.swin_base import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "swin_base" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "swin_base", + "Swin-Base", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/swin_base/info.yaml b/qai_hub_models/models/swin_base/info.yaml new file mode 100644 index 00000000..d3d8719d --- /dev/null +++ b/qai_hub_models/models/swin_base/info.yaml @@ -0,0 +1,40 @@ +name: Swin-Base +# id must match with the model dir name in qai_hub_models +id: swin_base +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: SwinBase is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone +research_paper: https://arxiv.org/abs/2103.14030 +research_paper_title: "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows" +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py +technical_details: + Number of parameters: 87.8M + Model size: 339 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - swin_tiny + - swin_small + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/swin_base/model.py b/qai_hub_models/models/swin_base/model.py new file mode 100644 index 00000000..bc68c9d4 --- /dev/null +++ b/qai_hub_models/models/swin_base/model.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import torch +import torchvision.models as tv_models +from torchvision.models.swin_transformer import PatchMerging, ShiftedWindowAttention + +from qai_hub_models.models._shared.common import replace_module_recursively +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier +from qai_hub_models.models._shared.swin.swin_transformer import ( + AutoSplitLinear, + ShiftedWindowAttentionInf, +) + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class SwinBase(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.swin_b(weights=weights) + replace_module_recursively( + net, ShiftedWindowAttention, ShiftedWindowAttentionInf + ) + replace_module_recursively( + net, torch.nn.Linear, AutoSplitLinear, parent_module=PatchMerging + ) + return cls(net.eval()) 
diff --git a/qai_hub_models/models/swin_base/perf.yaml b/qai_hub_models/models/swin_base/perf.yaml new file mode 100644 index 00000000..c27cc0d3 --- /dev/null +++ b/qai_hub_models/models/swin_base/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Swin-Base + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 76852.0 + throughput: 13.012023109353041 + estimated_peak_memory_range: + min: 12288 + max: 367871696 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 2006 + layers_on_cpu: 0 + total_layers: 2006 + job_id: jw568zrvg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:09:41.513292Z' diff --git a/qai_hub_models/models/swin_base/test.py b/qai_hub_models/models/swin_base/test.py new file mode 100644 index 00000000..c5da7795 --- /dev/null +++ b/qai_hub_models/models/swin_base/test.py @@ -0,0 +1,35 @@ +import numpy as np +import torchvision.models as tv_models + +from 
qai_hub_models.models._shared.imagenet_classifier.test_utils import ( # noqa: F401 + imagenet_sample_torch, + run_imagenet_classifier_test, +) +from qai_hub_models.models.swin_base.demo import main as demo_main +from qai_hub_models.models.swin_base.model import MODEL_ID, SwinBase + + +def test_numerical(imagenet_sample_torch): + # Ensure that the optimized SwinBase matches the original one numerically + x = imagenet_sample_torch + model_opt = SwinBase.from_pretrained().eval() + model_orig = tv_models.swin_b(weights="IMAGENET1K_V1").eval() + np.testing.assert_allclose( + model_opt(x).detach().numpy(), + model_orig(x).detach().numpy(), + atol=1e-5, + rtol=1e-3, + ) + + +def test_task(): + run_imagenet_classifier_test( + SwinBase.from_pretrained(), + MODEL_ID, + probability_threshold=0.53, + asset_version=1, + ) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/swin_small/README.md b/qai_hub_models/models/swin_small/README.md new file mode 100644 index 00000000..de3c2ec5 --- /dev/null +++ b/qai_hub_models/models/swin_small/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Swin-Small: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/swin_small) + +SwinSmall is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of Swin-Small found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/swin_small). 
+ +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.swin_small.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.swin_small.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Swin-Small can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
+ + +## References +* [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py) diff --git a/qai_hub_models/models/swin_small/__init__.py b/qai_hub_models/models/swin_small/__init__.py new file mode 100644 index 00000000..8533c9c3 --- /dev/null +++ b/qai_hub_models/models/swin_small/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import SwinSmall as Model # noqa: F401 diff --git a/qai_hub_models/models/swin_small/demo.py b/qai_hub_models/models/swin_small/demo.py new file mode 100644 index 00000000..5fcbea2f --- /dev/null +++ b/qai_hub_models/models/swin_small/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.swin_small.model import SwinSmall + + +def main(is_test: bool = False): + imagenet_demo(SwinSmall, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/swin_small/export.py b/qai_hub_models/models/swin_small/export.py new file mode 100644 index 00000000..69997876 --- /dev/null +++ b/qai_hub_models/models/swin_small/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.swin_small import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "swin_small" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "swin_small", + "Swin-Small", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options + " --compute_unit gpu", + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
def main():
    """Command-line entry point for the Swin-Small export script.

    Silences all Python warnings, parses the command line with the parser
    returned by ``export_parser`` (built for ``Model`` with QNN support
    disabled), and forwards every parsed argument to ``export_model`` as a
    keyword argument.
    """
    warnings.filterwarnings("ignore")
    cli_args = export_parser(model_cls=Model, supports_qnn=False).parse_args()
    export_model(**vars(cli_args))
class SwinSmall(ImagenetClassifier):
    """ImageNet classifier built on torchvision's ``swin_s`` network."""

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier:
        """Build the classifier from torchvision weights.

        Parameters:
            weights: Weights identifier handed to ``tv_models.swin_s``.
                Defaults to ``DEFAULT_WEIGHTS``.

        Returns:
            An instance of this class wrapping the network in eval mode.
        """
        backbone = tv_models.swin_s(weights=weights)

        # (module type to replace, replacement type, extra keyword arguments).
        # The second swap is restricted to Linear layers that live under a
        # PatchMerging parent.
        # NOTE(review): presumably these replacements make the network
        # friendlier to on-device export -- confirm against
        # _shared/swin/swin_transformer.py.
        module_swaps = (
            (ShiftedWindowAttention, ShiftedWindowAttentionInf, {}),
            (torch.nn.Linear, AutoSplitLinear, {"parent_module": PatchMerging}),
        )
        for old_type, new_type, extra_kwargs in module_swaps:
            replace_module_recursively(backbone, old_type, new_type, **extra_kwargs)

        backbone.eval()
        return cls(backbone)
a/qai_hub_models/models/swin_small/perf.yaml b/qai_hub_models/models/swin_small/perf.yaml new file mode 100644 index 00000000..d481ee20 --- /dev/null +++ b/qai_hub_models/models/swin_small/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Swin-Small + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 52492.0 + throughput: 19.05052198430237 + estimated_peak_memory_range: + min: 12288 + max: 222000632 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1965 + layers_on_cpu: 0 + total_layers: 1965 + job_id: jlpe7wl05 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:30:42.368348Z' diff --git a/qai_hub_models/models/swin_small/test.py b/qai_hub_models/models/swin_small/test.py new file mode 100644 index 00000000..dd2ed897 --- /dev/null +++ b/qai_hub_models/models/swin_small/test.py @@ -0,0 +1,35 @@ +import numpy as np +import torchvision.models as tv_models + +from 
def test_numerical(imagenet_sample_torch):
    """Check that the optimized SwinSmall matches torchvision's swin_s numerically.

    This function used to be named ``test_task``. A second ``test_task`` is
    defined further down in this module, which rebound the name and meant
    pytest never collected this check. It now has its own name so both run.

    NOTE(review): because this check was effectively never executed, it may
    surface a real mismatch (or a tolerance that is too tight) on first run.
    """
    x = imagenet_sample_torch
    model_opt = SwinSmall.from_pretrained().eval()
    model_orig = tv_models.swin_s(weights="IMAGENET1K_V1").eval()
    np.testing.assert_allclose(
        model_opt(x).detach().numpy(),
        model_orig(x).detach().numpy(),
        atol=1e-5,
        rtol=1e-3,
    )


def test_task():
    """Run the shared ImageNet classifier test against SwinSmall."""
    run_imagenet_classifier_test(
        SwinSmall.from_pretrained(),
        MODEL_ID,
        probability_threshold=0.53,
        asset_version=1,
    )


def test_demo():
    """Verify the demo entry point runs in test mode without raising."""
    demo_main(is_test=True)
+ +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.swin_tiny.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.swin_tiny.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Swin-Tiny can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
def main(is_test: bool = False):
    """Run the shared ImageNet classifier demo with the SwinTiny model class.

    Parameters:
        is_test: Forwarded unchanged to ``imagenet_demo``.
            NOTE(review): presumably suppresses interactive output when the
            demo is run from the test suite -- confirm in ``imagenet_demo``.
    """
    imagenet_demo(SwinTiny, is_test)
def export_model(
    device: str = "Samsung Galaxy S23",
    skip_profiling: bool = False,
    skip_inferencing: bool = False,
    skip_downloading: bool = False,
    skip_summary: bool = False,
    output_dir: Optional[str] = None,
    dst_runtime: str = "TFLITE",
    compile_options: str = "",
    profile_options: str = "",
    **additional_model_kwargs,
) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[
    str
]:
    """
    Export Swin-Tiny through Qualcomm AI Hub.

    This function accomplishes 6 main tasks:

        1. Instantiates a PyTorch model and converts it to a traced TorchScript format.
        2. Compiles the model to an asset that can be run on device.
        3. Profiles the model performance on real devices.
        4. Inferences the model on sample inputs.
        5. Downloads the model asset to the local directory.
        6. Summarizes the results from profiling and inference.

    Each of the last four steps can be optionally skipped using the input options.

    If `can_access_qualcomm_ai_hub()` reports no access, none of the steps
    above run; the call is delegated to `export_without_hub_access` and its
    result is returned instead.

    Parameters:
        device: Device for which to export the model.
            Full list of available devices can be found by running `hub.get_devices()`.
            Defaults to "Samsung Galaxy S23".
        skip_profiling: If set, skips profiling of compiled model on real devices.
        skip_inferencing: If set, skips computing on-device outputs from sample data.
        skip_downloading: If set, skips downloading of compiled model.
        skip_summary: If set, skips waiting for and summarizing results
            from profiling and inference.
        output_dir: Directory to store generated assets (e.g. compiled model).
            Defaults to `<current working directory>/build/swin_tiny`.
        dst_runtime: Which on-device runtime to target. Default is TensorFlowLite.
        compile_options: Additional options to pass when submitting the compile job.
        profile_options: Additional options to pass when submitting the profile job.
            The same string is also passed to the inference job.
        **additional_model_kwargs: Additional optional kwargs used to customize
            `model_cls.from_pretrained` and `model.get_input_spec`

    Returns:
        When AI Hub is accessible, a 3-tuple of:
            * A CompileJob object containing metadata about the compile job submitted to hub.
            * A ProfileJob containing metadata about the profile job (None if profiling skipped).
            * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
        Otherwise, whatever `export_without_hub_access` returns (annotated
        here as a list of strings).

    Raises:
        AssertionError: If summarizing is enabled and the profile or inference
            job does not report success.
    """
    model_name = "swin_tiny"
    # `/` binds tighter than `or`: an explicit output_dir wins, otherwise the
    # default is <cwd>/build/<model_name>.
    output_path = Path(output_dir or Path.cwd() / "build" / model_name)
    target_runtime = parse_target_runtime(dst_runtime)
    if not can_access_qualcomm_ai_hub():
        return export_without_hub_access(
            "swin_tiny",
            "Swin-Tiny",
            device,
            skip_profiling,
            skip_inferencing,
            skip_downloading,
            skip_summary,
            output_path,
            target_runtime,
            compile_options,
            profile_options,
        )

    # 1. Initialize PyTorch model
    model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs))
    input_spec = model.get_input_spec(
        **get_input_spec_kwargs(model, additional_model_kwargs)
    )

    # Trace the model
    source_model = torch.jit.trace(model, make_torch_inputs(input_spec))

    # 2. Compile the model to an on-device asset
    # The compiled asset is asked to take `image_tensor` in channel-last
    # layout; step 4 transposes the sample inputs to match.
    model_compile_options = model.get_hub_compile_options(
        target_runtime, compile_options + " --force_channel_last_input image_tensor"
    )
    print(f"Optimizing model {model_name} to run on-device.")
    compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_spec,
        device=hub.Device(device),
        name=model_name,
        options=model_compile_options,
    )

    # 3. Profile the model asset on real devices
    profile_job = None
    if not skip_profiling:
        print(f"Profiling model {model_name} on a hosted device.")
        profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=model_name,
            options=profile_options + " --compute_unit gpu",
        )

    # 4. Run inference on-device with sample inputs
    inference_job = None
    if not skip_inferencing:
        print(
            f"Running inference for {model_name} on a hosted device with example inputs."
        )
        # `sample_inputs` is only bound on this branch; step 6 reads it under
        # the same `not skip_inferencing` condition.
        sample_inputs = model.sample_inputs(input_spec)
        # Convert inputs from channel first to channel last
        hub_inputs = transpose_channel_first_to_last(
            "image_tensor", sample_inputs, target_runtime
        )
        # NOTE(review): the inference job reuses `profile_options`; this
        # function has no separate inference-options parameter.
        inference_job = hub.submit_inference_job(
            model=compile_job.get_target_model(),
            inputs=hub_inputs,
            device=hub.Device(device),
            name=model_name,
            options=profile_options + " --compute_unit gpu",
        )

    # 5. Download the model asset to a local file
    if not skip_downloading:
        os.makedirs(output_path, exist_ok=True)
        target_model = compile_job.get_target_model()
        # The file extension is fixed to .tflite regardless of dst_runtime.
        target_model.download(str(output_path / f"{model_name}.tflite"))

    # 6. Summarize the results from profiling and inference
    if not skip_summary and not skip_profiling:
        assert profile_job.wait().success
        profile_data = profile_job.download_profile()
        print_profile_metrics_from_job(profile_job, profile_data)

    if not skip_summary and not skip_inferencing:
        torch_out = torch_inference(model, sample_inputs)
        assert inference_job.wait().success
        inference_result = inference_job.download_output_data()
        print_inference_metrics(inference_job, inference_result, torch_out)

    print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)

    return (compile_job, profile_job, inference_job)


def main():
    """Command-line entry point: parse the export arguments and run `export_model`."""
    # All Python warnings are silenced for the duration of the CLI run.
    warnings.filterwarnings("ignore")
    parser = export_parser(model_cls=Model, supports_qnn=False)
    args = parser.parse_args()
    export_model(**vars(args))
class SwinTiny(ImagenetClassifier):
    """ImageNet classifier built on torchvision's ``swin_t`` network."""

    @classmethod
    def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier:
        """Build the classifier from torchvision weights.

        Parameters:
            weights: Weights identifier handed to ``tv_models.swin_t``.
                Defaults to ``DEFAULT_WEIGHTS``.

        Returns:
            An instance of this class wrapping the network in eval mode.
        """
        backbone = tv_models.swin_t(weights=weights)

        # (module type to replace, replacement type, extra keyword arguments).
        # The second swap is restricted to Linear layers that live under a
        # PatchMerging parent.
        # NOTE(review): presumably these replacements make the network
        # friendlier to on-device export -- confirm against
        # _shared/swin/swin_transformer.py.
        module_swaps = (
            (ShiftedWindowAttention, ShiftedWindowAttentionInf, {}),
            (torch.nn.Linear, AutoSplitLinear, {"parent_module": PatchMerging}),
        )
        for old_type, new_type, extra_kwargs in module_swaps:
            replace_module_recursively(backbone, old_type, new_type, **extra_kwargs)

        backbone.eval()
        return cls(backbone)
a/qai_hub_models/models/swin_tiny/perf.yaml b/qai_hub_models/models/swin_tiny/perf.yaml new file mode 100644 index 00000000..7603ecf3 --- /dev/null +++ b/qai_hub_models/models/swin_tiny/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Swin-Tiny + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 29469.0 + throughput: 33.93396450507313 + estimated_peak_memory_range: + min: 0 + max: 193113472 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1059 + layers_on_cpu: 0 + total_layers: 1059 + job_id: jqpyojx45 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:18:27.047126Z' diff --git a/qai_hub_models/models/swin_tiny/test.py b/qai_hub_models/models/swin_tiny/test.py new file mode 100644 index 00000000..efecccaa --- /dev/null +++ b/qai_hub_models/models/swin_tiny/test.py @@ -0,0 +1,38 @@ +import numpy as np +import torchvision.models as tv_models + +from 
def test_numerical(imagenet_sample_torch):
    """Check that the optimized SwinTiny matches torchvision's swin_t numerically.

    This function used to be named ``test_task``. A second ``test_task`` is
    defined further down in this module, which rebound the name and meant
    pytest never collected this check. It now has its own name so both run.

    NOTE(review): because this check was effectively never executed, it may
    surface a real mismatch (or a tolerance that is too tight) on first run.
    """
    x = imagenet_sample_torch
    model_opt = SwinTiny.from_pretrained().eval()
    model_orig = tv_models.swin_t(weights="IMAGENET1K_V1").eval()
    np.testing.assert_allclose(
        model_opt(x).detach().numpy(),
        model_orig(x).detach().numpy(),
        atol=1e-5,
        rtol=1e-3,
    )


def test_task():
    """Run the shared ImageNet classifier test against SwinTiny."""
    run_imagenet_classifier_test(
        SwinTiny.from_pretrained(), MODEL_ID, probability_threshold=0.53
    )


def test_trace():
    """Run the shared ImageNet classifier trace test against SwinTiny."""
    run_imagenet_classifier_trace_test(SwinTiny.from_pretrained())


def test_demo():
    """Verify the demo entry point runs in test mode without raising."""
    demo_main(is_test=True)
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/trocr). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[trocr]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.trocr.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.trocr.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of TrOCR can be found + [here](https://github.com/microsoft/unilm/blob/master/LICENSE). 
+ + +## References +* [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) +* [Source Model Implementation](https://huggingface.co/microsoft/trocr-small-stage1) diff --git a/qai_hub_models/models/trocr/__init__.py b/qai_hub_models/models/trocr/__init__.py new file mode 100644 index 00000000..ae01a3a7 --- /dev/null +++ b/qai_hub_models/models/trocr/__init__.py @@ -0,0 +1,3 @@ +from .app import TrOCRApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import TrOCR as Model # noqa: F401 diff --git a/qai_hub_models/models/trocr/app.py b/qai_hub_models/models/trocr/app.py new file mode 100644 index 00000000..972ef8ad --- /dev/null +++ b/qai_hub_models/models/trocr/app.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +from typing import Generator, List + +import torch +from PIL.Image import Image + +from qai_hub_models.models.trocr.model import KVCache, TrOCR + + +class TrOCRApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference with TrOCR. + + The app uses 2 models: + * encoder (modified to return cross attention key-value) + * decoder + + For a given image input, the app will: + * use the io_processor to pre-process the image (reshape & normalize) + * call the encoder once + * run the decoder in a loop until the "end of sentence" token is predicted, + or the max sequence length (defined by the source model config) is reached + * map the output tokens to a string via `io_processor`. 
+ """ + + def __init__(self, model: TrOCR): + self.encoder = model.encoder + self.decoder = model.decoder + self.io_processor = model.io_processor + + self.pad_token_id = model.pad_token_id + self.eos_token_id = model.eos_token_id + self.start_token_id = model.start_token_id + self.max_seq_len = model.max_seq_len + + def preprocess_image(self, image: Image) -> torch.Tensor: + """Convert a raw image (resize, normalize) into a pyTorch tensor that can be used as input to TrOCR inference. + This also converts the image to RGB, which is the expected input channel layout for TrOCR. + + For more information on preprocessing, see https://huggingface.co/docs/transformers/preprocessing.""" + assert ( + self.io_processor is not None + ), "TrOCR processor most be provided to use type Image as an input." + return self.io_processor(image.convert("RGB"), return_tensors="pt").pixel_values + + def predict(self, *args, **kwargs): + # See predict_text_from_image. + return self.predict_text_from_image(*args, **kwargs) + + def predict_text_from_image( + self, pixel_values_or_image: torch.Tensor | Image, raw_output: bool = False + ) -> torch.Tensor | List[str]: + """ + From the provided image or tensor, predict the line of text contained within. + + Parameters: + pixel_values_or_image: torch.Tensor + Input PIL image (before pre-processing) or pyTorch tensor (after image pre-processing). + raw_output: bool + If false, return a list of predicted strings (one for each batch). Otherwise, return a tensor of predicted token IDs. + + Returns: + The output word / token sequence (representative of the text contained in the input image). + + The prediction will be a list of strings (one string per batch) if self.io_processor != None and raw_output=False. + Otherwise, a `torch.Tensor` of shape [batch_size, predicted_sequence_length] is returned. It contains predicted token IDs. 
+ """ + gen = self.stream_predicted_text_from_image(pixel_values_or_image, raw_output) + _ = last = next(gen) + for last in gen: + pass + return last + + def stream_predicted_text_from_image( + self, pixel_values_or_image: torch.Tensor | Image, raw_output: bool = False + ) -> Generator[torch.Tensor | List[str], None, None]: + """ + From the provided image or tensor, predict the line of text contained within. + The returned generator will produce a single output per decoder iteration. + + The generator allows the client to "stream" output from the decoder + (eg. get the prediction one word at as time as they're predicted, instead of waiting for the entire output sequence to be predicted) + + Parameters: + pixel_values_or_image: torch.Tensor + Input PIL image (before pre-processing) or pyTorch tensor (after image pre-processing). + raw_output: bool + If false, return a list of predicted strings (one for each batch). Otherwise, return a tensor of predicted token IDs. + + Returns: + A python generator for the output word / token sequence (representative of the text contained in the input image). + The generator will produce one output for every decoder iteration. + + The prediction will be a list of strings (one string per batch) if self.io_processor != None and raw_output=False. + Otherwise, a `torch.Tensor` of shape [batch_size, predicted_sequence_length] is returned. It contains predicted token IDs. 
+ """ + if isinstance(pixel_values_or_image, Image): + pixel_values = self.preprocess_image(pixel_values_or_image) + else: + pixel_values = pixel_values_or_image + + batch_size = pixel_values.shape[0] + eos_token_id_tensor = torch.tensor([self.eos_token_id], dtype=torch.int32) + + # Run encoder + kv_cache_cross_attn = self.encoder(pixel_values) + + # Initial KV Cache + initial_attn_cache = get_empty_attn_cache( + batch_size, + self.decoder.num_decoder_layers, + self.decoder.decoder_attention_heads, + self.decoder.embeddings_per_head, + ) + initial_kv_cache = combine_kv_caches(kv_cache_cross_attn, initial_attn_cache) + kv_cache = initial_kv_cache + + # Prepare decoder input IDs. Shape: [batch_size, 1] + initial_input_ids = ( + torch.ones((batch_size, 1), dtype=torch.int32) * self.start_token_id + ) + input_ids = initial_input_ids + + # Prepare decoder output IDs. Shape: [batch_size, seq_len] + output_ids = input_ids + + # Keep track of which sequences are already finished. Shape: [batch_size] + unfinished_sequences = torch.ones(batch_size, dtype=torch.int32) + + while unfinished_sequences.max() != 0 and ( + self.max_seq_len is None or output_ids.shape[-1] < self.max_seq_len + ): + # Get next tokens. Shape: [batch_size] + outputs = self.decoder(input_ids, *kv_cache) + next_tokens = outputs[0] + kv_cache_attn = outputs[1:] + + # Finished sentences should have padding token appended instead of the prediction. 
+ next_tokens = next_tokens * unfinished_sequences + self.pad_token_id * ( + 1 - unfinished_sequences + ) + + input_ids = torch.unsqueeze(next_tokens, -1) + output_ids = torch.cat([output_ids, input_ids], dim=-1) + yield self.io_processor.batch_decode( + output_ids, skip_special_tokens=True + ) if self.io_processor and not raw_output else output_ids + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + torch.unsqueeze(next_tokens, -1) + .ne(eos_token_id_tensor.unsqueeze(1)) + .prod(dim=0) + .type(torch.int32) + ) + + # Re-construct kv cache with new sequence. + kv_cache = combine_kv_caches(kv_cache_cross_attn, kv_cache_attn) + + +def combine_kv_caches( + kv_cache_cross_attn: KVCache, + kv_cache_attn: KVCache, +) -> KVCache: + """ + Generates full KV Cache from cross attention KV cache and attention KV cache. + + Parameters: + kv_cache_cross_attn: Tuple[kv_cache_cross_attn_0_key, kv_cache_cross_attn_0_val, cv_cache_cross_attn_1_key, ...] + Cross attn KV cache generated by CrossAttnKVGenerator. + len(tuple) == 2 * number of source model decoder layers. + + kv_cache_attn: Tuple[kv_cache_attn_0_key, kv_cache_attn_0_val cv_cache_attn_1_key, ...] + Attn generated by the decoder, or None (generate empty cache) if the decoder has not run yet. + len(tuple) == 2 * number of source model decoder layers. + + Returns: + kv_cache: Tuple[kv_cache_attn_0_key, kv_cache_attn_0_val, + kv_cache_cross_attn_0_key, kv_cache_cross_attn_0_val, + kv_cache_attn_1_key, ...] + Combined KV Cache. + len(tuple) == 4 * number of source model decoder layers. + """ + # Construct remaining kv cache with a new empty sequence. + kv_cache = [torch.Tensor()] * len(kv_cache_cross_attn) * 2 + + # Combine KV Cache. 
+ for i in range(0, len(kv_cache_cross_attn) // 2): + kv_cache[4 * i] = kv_cache_attn[2 * i] + kv_cache[4 * i + 1] = kv_cache_attn[2 * i + 1] + kv_cache[4 * i + 2] = kv_cache_cross_attn[2 * i] + kv_cache[4 * i + 3] = kv_cache_cross_attn[2 * i + 1] + + return (*kv_cache,) + + +def get_empty_attn_cache( + batch_size: int, + num_decoder_layers: int, + decoder_attention_heads: int, + embeddings_per_head: int, +) -> KVCache: + """ + Generates empty attn KV Cache for use in the first iteration of the decoder. + + Parameters: + batch_size: Batch size. + num_decoder_layers: Number of decoder layers in the decoder. + decoder_attention_heads: Number of attention heads in the decoder. + embeddings_per_head: The count of the embeddings in each decoder attention head. + + Returns: + kv_cache: Tuple[kv_cache_attn_0_key, kv_cache_attn_0_val, kv_cache_attn_1_key, ...] + len(tuple) == 2 * number of source model decoder layers. + """ + kv_cache = [] + for i in range(0, num_decoder_layers): + kv_cache.append( + torch.zeros( + ( + batch_size, + decoder_attention_heads, + 0, + embeddings_per_head, + ) + ) + ) + kv_cache.append( + torch.zeros( + ( + batch_size, + decoder_attention_heads, + 0, + embeddings_per_head, + ) + ) + ) + return (*kv_cache,) diff --git a/qai_hub_models/models/trocr/demo.py b/qai_hub_models/models/trocr/demo.py new file mode 100644 index 00000000..8ec2db9c --- /dev/null +++ b/qai_hub_models/models/trocr/demo.py @@ -0,0 +1,53 @@ +import time + +from qai_hub_models.models.trocr.app import TrOCRApp +from qai_hub_models.models.trocr.model import ( + HUGGINGFACE_TROCR_MODEL, + MODEL_ASSET_VERSION, + MODEL_ID, + TrOCR, +) +from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + +HUGGINGFACE_TROCR_MODEL = "microsoft/trocr-small-stage1" +DEFAULT_SAMPLE_IMAGE = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "sample_text.jpg" +) + + +# Run 
TrOCR end-to-end on a sample line of handwriting. +# The demo will output the text contained within the source image. +# Text will be printed to terminal as it is generated with each decoder loop. +def main(is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(TrOCR) + parser.add_argument( + "--image", + type=str, + default=DEFAULT_SAMPLE_IMAGE, + help="image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + + # Load Application + app = TrOCRApp(model_from_cli_args(TrOCR, args)) + + # Load Image + image = load_image(args.image) + + # Stream output from model + print("\n** Predicted Text **\n") + + for output in app.stream_predicted_text_from_image(image): + if is_test: + continue + print(output[0], end="\r") + # Sleep to accentuate the "streaming" effect in terminal output. + time.sleep(0.1) + + print("\n") + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/trocr/export.py b/qai_hub_models/models/trocr/export.py new file mode 100644 index 00000000..bf80d2b7 --- /dev/null +++ b/qai_hub_models/models/trocr/export.py @@ -0,0 +1,217 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Mapping, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.trocr import Model +from qai_hub_models.utils.args import ( + export_parser, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + +ALL_COMPONENTS = ["TrOCREncoder", "TrOCRDecoder"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[ + str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. 
+ components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "trocr" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + component_arg = components + components = components or ALL_COMPONENTS + for component in components: + if component not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "trocr", + "TrOCR", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict = {} + if "TrOCREncoder" in components: + components_dict["TrOCREncoder"] = model.encoder + if "TrOCRDecoder" in components: + components_dict["TrOCRDecoder"] = model.decoder + + compile_jobs = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input pixel_values" + ) + print(f"Optimizing model {component_name} to run on-device.") + compile_jobs[component_name] = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{component_name}", + options=model_compile_options, + ) + + # 3. Profile the model assets on real devices + profile_jobs = {} + if not skip_profiling: + for component_name in components: + print(f"Profiling model {component_name} on a hosted device.") + profile_jobs[component_name] = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_jobs = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." 
+ ) + sample_inputs = components_dict[component_name].sample_inputs() + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "pixel_values", sample_inputs, target_runtime + ) + inference_jobs[component_name] = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 5. Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model = compile_job.get_target_model() + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git 
a/qai_hub_models/models/trocr/info.yaml b/qai_hub_models/models/trocr/info.yaml new file mode 100644 index 00000000..8a31ae98 --- /dev/null +++ b/qai_hub_models/models/trocr/info.yaml @@ -0,0 +1,36 @@ +name: TrOCR +# id must match with the model dir name in qai_hub_models +id: trocr +status: public +headline: Transformer based model for state-of-the-art optical character recognition + (OCR) on both printed and handwritten text. +domain: Multimodal +description: End-to-end text recognition approach with pre-trained image transformer + and text transformer models for both image understanding and wordpiece-level text + generation. +use_case: Image To Text +tags: [] +research_paper: https://arxiv.org/abs/2109.10282 +research_paper_title: 'TrOCR: Transformer-based Optical Character Recognition with + Pre-trained Models' +license: https://github.com/microsoft/unilm/blob/master/LICENSE +source_repo: https://huggingface.co/microsoft/trocr-small-stage1 +technical_details: + Model checkpoint: trocr-small-stage1 + Input resolution: 320x320 + Number of parameters (TrOCREncoder): 23.0M + Model size (TrOCREncoder): 87.8 MB + Number of parameters (TrOCRDecoder): 38.3M + Model size (TrOCRDecoder): 146 MB +applicable_scenarios: +- Publishing +- Healthcare +- Document Management +form_factors: +- Phone +- Tablet +related_models: [] +has_static_banner: yes +has_animated_banner: yes +license_type: mit +dataset: [] diff --git a/qai_hub_models/models/trocr/model.py b/qai_hub_models/models/trocr/model.py new file mode 100644 index 00000000..1b66edf6 --- /dev/null +++ b/qai_hub_models/models/trocr/model.py @@ -0,0 +1,253 @@ +from __future__ import annotations + +import copy +from typing import Callable, List, Tuple + +import numpy as np +import torch +from transformers import TrOCRProcessor, VisionEncoderDecoderModel +from transformers.models.trocr.modeling_trocr import ( + PreTrainedModel, + TrOCRAttention, + TrOCRForCausalLM, +) + +from qai_hub_models.utils.base_model import BaseModel, 
CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +HUGGINGFACE_TROCR_MODEL = "microsoft/trocr-small-stage1" +MODEL_ID = __name__.split(".")[-2] +TROCR_BATCH_SIZE = 1 +TROCR_EXPORT_SEQ_LEN = 1 # -1 TODO(#5428): Dynamic sequence length support. This limits the input size to a seq len of 1. +MODEL_ASSET_VERSION = 1 + +""" +Traceable modules used by TrOCRApp +""" +KVCache = Tuple[torch.Tensor, ...] # Export friendly + + +class TrOCR(CollectionModel): + def __init__( + self, + encoder: Callable[[torch.Tensor], KVCache], + decoder: Callable[..., Tuple[torch.Tensor, ...]], + io_processor: TrOCRProcessor, + pad_token_id: int, + eos_token_id: int, + start_token_id: int, + max_seq_len: int, + ): + self.encoder = encoder + self.decoder = decoder + self.io_processor = io_processor + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + self.start_token_id = start_token_id + self.max_seq_len = max_seq_len + + @classmethod + def from_pretrained(cls, hf_trocr_model: str = HUGGINGFACE_TROCR_MODEL) -> TrOCR: + # Load Huggingface source + source_model = VisionEncoderDecoderModel.from_pretrained( + hf_trocr_model, return_dict=False + ) + io_processor = TrOCRProcessor.from_pretrained(hf_trocr_model) + return TrOCR.from_source_model(source_model, io_processor) # type: ignore + + @staticmethod + def from_source_model( + source_model: VisionEncoderDecoderModel, io_processor: TrOCRProcessor + ) -> TrOCR: + encoder = TrOCREncoder(source_model.encoder, source_model.decoder) # type: ignore + decoder = TrOCRDecoder(source_model.decoder) # type: ignore + return TrOCR( + encoder, + decoder, + io_processor, + source_model.generation_config.pad_token_id, # type: ignore + source_model.generation_config.eos_token_id, # type: ignore + (source_model.generation_config.decoder_start_token_id or source_model.generation_config.bos_token_id), # type: ignore + source_model.generation_config.max_length, # type: ignore + ) + + +class TrOCREncoder(BaseModel): + """Vision 
encoder that returns the decoder's cross attn cache state.""" + + def __init__(self, encoder: PreTrainedModel, decoder: TrOCRForCausalLM): + super().__init__() + self.encoder = encoder + self.decoder = decoder + self.cross_attn_kv_shape: Callable = decoder.model.decoder.layers[0].encoder_attn._shape # type: ignore + + def forward( + self, + pixel_values: torch.FloatTensor, + ) -> KVCache: + """ + Run the encoder on `pixel_values`, and produce a cross attention key/value cache that can be used as decoder input. + + Parameters: + pixel_values: Pixel values pre-processed for encoder consumption. + + Returns: + cross_attn_kv_cache: Tuple[kv_cache_cross_attn_0_key, kv_cache_cross_attn_0_val, kv_cache_cross_attn_1_key, ...] + KV Cache for cross attention layers. + len(tuple) == 2 * number of source model decoder layers. + """ + encoder_hidden_states = self.encoder(pixel_values, return_dict=False)[0] + kv_cache = [] + batch_size = encoder_hidden_states.shape[0] + for layer in self.decoder.model.decoder.layers: + layer_attn: TrOCRAttention = layer.encoder_attn # type: ignore + key_states = self.cross_attn_kv_shape( + layer_attn.k_proj(encoder_hidden_states), -1, batch_size + ) + value_states = self.cross_attn_kv_shape( + layer_attn.v_proj(encoder_hidden_states), -1, batch_size + ) + kv_cache.append(key_states) + kv_cache.append(value_states) + + return (*kv_cache,) # convert list to tuple for export + + def get_input_spec(self) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declared + # the model input specification upon submitting a profile job. + return {"pixel_values": ((TROCR_BATCH_SIZE, 3, 384, 384), "float32")} + + @classmethod + def from_pretrained(cls): + return TrOCR.from_pretrained().encoder + + +class TrOCRDecoder(BaseModel): + """ + Wraps Vision decoder in an export-friendly interface. 
+ + Inputs: (input_ids, KV Cache (unrolled in order generated by combine_kv_caches in app.py)) + Outputs: (output_ids, Updated Attention KV Cache) + """ + + def __init__(self, decoder: TrOCRForCausalLM): + super().__init__() + self.decoder = copy.deepcopy(decoder) + # Delete unused layers that exist only to generate initial KV cache. + self.num_decoder_layers = len(self.decoder.model.decoder.layers) + for layer in self.decoder.model.decoder.layers: + layer_attn: TrOCRAttention = layer.encoder_attn # type: ignore + layer_attn.k_proj = None # type: ignore + layer_attn.v_proj = None # type: ignore + self.max_position_embeddings: int = self.decoder.config.max_position_embeddings # type: ignore + self.decoder_attention_heads: int = decoder.config.decoder_attention_heads + self.embeddings_per_head: int = ( + decoder.config.d_model // decoder.config.decoder_attention_heads + ) + + def forward( + self, input_ids: torch.IntTensor, *kv_cache_args, **kv_cache_kwargs + ) -> Tuple[torch.Tensor, ...]: + """ + Generate the next token in the predicted output text sequence. + + Parameters: + input_ids : torch.IntTensor + Next token ID in each batch sequence (always shape (batch_size, 1)) + + kv_cache: Tuple[kv_cache_attn_0_key, kv_cache_attn_0_val, + kv_cache_cross_attn_0_key, kv_cache_cross_attn_0_val, + kv_cache_attn_1_key, ...] + Combined KV Cache generated by combine_kv_caches in app.py. + len(tuple) == 4 * number of source model decoder layers. + + Returns: + outputs : Tuple[ + predicted_ids + Next predicted token. + kv_cache_attn_0_key, kv_cache_attn_0_val, kv_cache_attn_1_key, ... + Updated KV cache for attention layers. Count == 2 * number of source model decoder layers. + ] + """ + # encoder_hidden_states is not used by the network when kv_cache is set. 
+ # + # Unfortunately the underlying huggingface code does not allow us to + # get rid of the input entirely, because the decoder layer implementation uses its existence + # to determine if it should include cross-attention layers. + # + # Therefore, we set the hidden state to shape [1] in this case to minimize footprint. + # It will go away when traced. + encoder_hidden_states = torch.from_numpy(np.array([1])) + + # Convert KV Cache from export friendly format to decoder format + kv_cache: List[Tuple[torch.Tensor, ...]] = [] + curr_tuple: List[torch.Tensor] = [] + for arg in kv_cache_args or kv_cache_kwargs.values(): + curr_tuple.append(arg) + if len(curr_tuple) == 4: + kv_cache.append((*curr_tuple,)) + curr_tuple = [] + kv_cache = (*kv_cache,) # type: ignore + + # Run decoder + outputs = self.decoder( + input_ids=input_ids, + encoder_hidden_states=encoder_hidden_states, + return_dict=False, + use_cache=True, + past_key_values=kv_cache, + ) + + # KV Cache conversion to export-friendly format (tuple of tensors) + # Don't output cross attn KV cache because it does not change. + out_kv_cache: List[torch.Tensor] = [] + for layer_cache in outputs[1]: + out_kv_cache = out_kv_cache + list(layer_cache)[:2] + + # Argmax Logits, Sequence-Only (Attn) KV Cache + return ( + torch.argmax(torch.squeeze(outputs[0], dim=1), dim=-1), + *out_kv_cache, + ) + + def get_input_spec(self) -> InputSpec: + """ + Returns the input specification (name -> (shape, type)). This can be + used to submit a profiling job on Qualcomm AI Hub. + """ + input_ids_spec = ((1, 1), "int32") + + attn_cache_spec = ( + ( + TROCR_BATCH_SIZE, + self.decoder_attention_heads, + TROCR_EXPORT_SEQ_LEN, + self.embeddings_per_head, + ), + "float32", + ) + + cross_attn_cache_spec = ( + ( + TROCR_BATCH_SIZE, + self.decoder_attention_heads, + 578, # TODO: Can we get this programmatically? 
+ self.embeddings_per_head, + ), + "float32", + ) + + decoder_input_specs: InputSpec = {"input_ids": input_ids_spec} + for i in range(0, self.num_decoder_layers): + decoder_input_specs[f"kv_{i}_attn_key"] = attn_cache_spec + decoder_input_specs[f"kv_{i}_attn_val"] = attn_cache_spec + decoder_input_specs[f"kv_{i}_cross_attn_key"] = cross_attn_cache_spec + decoder_input_specs[f"kv_{i}_cross_attn_val"] = cross_attn_cache_spec + + return decoder_input_specs + + @classmethod + def from_pretrained(cls): + return TrOCR.from_pretrained().decoder diff --git a/qai_hub_models/models/trocr/perf.yaml b/qai_hub_models/models/trocr/perf.yaml new file mode 100644 index 00000000..b9cea027 --- /dev/null +++ b/qai_hub_models/models/trocr/perf.yaml @@ -0,0 +1,107 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: TrOCREncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 244369.0 + throughput: 4.092172084020477 + estimated_peak_memory_range: + min: 7294976 + max: 10455296 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 627 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 627 + job_id: j2p0m26eg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: 
Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:31:39.426796Z' +- name: TrOCRDecoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 2820.0 + throughput: 354.6099290780142 + estimated_peak_memory_range: + min: 20480 + max: 2212720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 394 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 394 + job_id: j1p8em18p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:34:45.126605Z' diff --git a/qai_hub_models/models/trocr/requirements.txt b/qai_hub_models/models/trocr/requirements.txt new file mode 100644 index 00000000..3a308074 --- /dev/null +++ b/qai_hub_models/models/trocr/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.33.2 +sentencepiece diff --git a/qai_hub_models/models/trocr/test.py b/qai_hub_models/models/trocr/test.py new file mode 100644 index 00000000..d6e53443 --- /dev/null +++ b/qai_hub_models/models/trocr/test.py @@ -0,0 +1,60 @@ +import numpy as np +import pytest +import torch +from transformers import TrOCRProcessor, VisionEncoderDecoderModel + +from qai_hub_models.models.trocr.app import TrOCRApp +from qai_hub_models.models.trocr.demo import DEFAULT_SAMPLE_IMAGE +from qai_hub_models.models.trocr.demo import main as demo_main +from qai_hub_models.models.trocr.model import HUGGINGFACE_TROCR_MODEL, TrOCR +from qai_hub_models.utils.asset_loaders import 
load_image + +IMAGE_TEXT = 'industrial " Mr. Brown commented icity., letus have a' + + +@pytest.fixture(scope="module") +def source_huggingface_model() -> VisionEncoderDecoderModel: + return VisionEncoderDecoderModel.from_pretrained( + HUGGINGFACE_TROCR_MODEL, return_dict=False + ) # type: ignore + + +@pytest.fixture(scope="module") +def trocr_app(source_huggingface_model: VisionEncoderDecoderModel) -> TrOCRApp: + # Load Huggingface source + source_model = source_huggingface_model + io_processor = TrOCRProcessor.from_pretrained(HUGGINGFACE_TROCR_MODEL) + + # Load Application + return TrOCRApp(TrOCR.from_source_model(source_model, io_processor)) + + +@pytest.fixture(scope="module") +def processed_sample_image(trocr_app: TrOCRApp) -> torch.Tensor: + """Huggingface-provided image preprocessing and token decoding.""" + return trocr_app.preprocess_image(load_image(DEFAULT_SAMPLE_IMAGE)) + + +def test_predict_text_from_image( + trocr_app: TrOCRApp, processed_sample_image: torch.Tensor +): + """Verify our driver produces the correct sentences from a given image input.""" + assert trocr_app.predict_text_from_image(processed_sample_image)[0] == IMAGE_TEXT + + +def test_task( + source_huggingface_model: VisionEncoderDecoderModel, + trocr_app: TrOCRApp, + processed_sample_image: torch.Tensor, +): + """Verify that raw (numeric) outputs of both networks are the same.""" + source_out = source_huggingface_model.generate(processed_sample_image).numpy() + qaihm_out = trocr_app.predict_text_from_image( + processed_sample_image, raw_output=True + ) + + assert np.allclose(source_out, qaihm_out) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/unet_segmentation/README.md b/qai_hub_models/models/unet_segmentation/README.md new file mode 100644 index 00000000..7e1b7c39 --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub 
Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Unet-Segmentation: Real-time segmentation optimized for mobile and edge](https://aihub.qualcomm.com/models/unet_segmentation) + +UNet is a machine learning model that produces a segmentation mask for an image. The most basic use case will label each pixel in the image as being in the foreground or the background. More advanced usage will assign a class label to each pixel. + +This is based on the implementation of Unet-Segmentation found +[here](https://github.com/milesial/Pytorch-UNet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/unet_segmentation). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.unet_segmentation.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.unet_segmentation.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. 
+- The license for the original implementation of Unet-Segmentation can be found + [here](https://github.com/milesial/Pytorch-UNet/blob/master/LICENSE). + + +## References +* [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) +* [Source Model Implementation](https://github.com/milesial/Pytorch-UNet) diff --git a/qai_hub_models/models/unet_segmentation/__init__.py b/qai_hub_models/models/unet_segmentation/__init__.py new file mode 100644 index 00000000..7c9d10af --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/__init__.py @@ -0,0 +1,2 @@ +from .model import MODEL_ID # noqa: F401 +from .model import UNet as Model # noqa: F401 diff --git a/qai_hub_models/models/unet_segmentation/app.py b/qai_hub_models/models/unet_segmentation/app.py new file mode 100644 index 00000000..13487aa0 --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/app.py @@ -0,0 +1,38 @@ +from typing import Callable + +import torch +from PIL.Image import Image + +from qai_hub_models.utils.image_processing import preprocess_PIL_image + + +class UNetSegmentationApp: + """ + This class consists of light-weight "app code" that is required to + perform end to end inference with UNet. + + For a given image input, the app will: + * Pre-process the image (resize and normalize) + * Run UNet Inference + * Convert the raw output into segmented image. + """ + + def __init__(self, model: Callable[[torch.Tensor], torch.Tensor]): + self.model = model + + def predict(self, image: Image) -> torch.Tensor: + """ + From the provided image or tensor, generate the segmented mask. + + Parameters: + image: A PIL Image in RGB format. + + Returns: + mask: Segmented mask as numpy array. 
+ """ + + img = preprocess_PIL_image(image) + with torch.no_grad(): + out = self.model(img) + mask = out.argmax(dim=1) + return mask[0].bool().numpy() diff --git a/qai_hub_models/models/unet_segmentation/demo.py b/qai_hub_models/models/unet_segmentation/demo.py new file mode 100644 index 00000000..2b79c011 --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/demo.py @@ -0,0 +1,69 @@ +from typing import Callable + +import torch +from PIL.Image import fromarray + +from qai_hub_models.models.unet_segmentation.app import UNetSegmentationApp +from qai_hub_models.models.unet_segmentation.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + UNet, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, PathType, load_image +from qai_hub_models.utils.display import display_or_save_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "unet_test_image.jpg" +) + + +# Run unet segmentation app end-to-end on a sample image. +# The demo will display the predicted mask in a window. 
+def unet_demo( + model: Callable[..., Callable[[torch.Tensor, torch.Tensor], torch.Tensor]], + default_image: PathType, + is_test: bool = False, +): + # Demo parameters + parser = get_model_cli_parser(UNet) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=None, + help="File path or URL to an input image to use for the demo.", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, model.get_model_id()) + + # Load image & model + model = demo_model_from_cli_args(UNet, args) + print("Model loaded from pre-trained weights.") + image = load_image( + args.image or default_image, verbose=True, desc="sample input image" + ) + + # Run app + app = UNetSegmentationApp(model) + mask = fromarray(app.predict(image)) + if not is_test: + display_or_save_image(image, args.output_dir, "input_image.png", "input image") + display_or_save_image(mask, args.output_dir, "mask.png", "mask") + + +def main(is_test: bool = False): + unet_demo( + UNet, + IMAGE_ADDRESS, + is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/unet_segmentation/export.py b/qai_hub_models/models/unet_segmentation/export.py new file mode 100644 index 00000000..5df69a07 --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/export.py @@ -0,0 +1,193 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.unet_segmentation import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. 
+ skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "unet_segmentation" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "unet_segmentation", + "Unet-Segmentation", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/unet_segmentation/info.yaml b/qai_hub_models/models/unet_segmentation/info.yaml new file mode 100644 index 00000000..e767745d --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/info.yaml @@ -0,0 +1,35 @@ +name: Unet-Segmentation +# id must match with the model dir name in qai_hub_models +id: unet_segmentation +status: public +headline: Real-time segmentation optimized for mobile and edge. +domain: Computer Vision +description: UNet is a machine learning model that produces a segmentation mask for an image. The most basic use case will label each pixel in the image as being in the foreground or the background. More advanced usage will assign a class label to each pixel. 
+use_case: Semantic Segmentation +tags: + - backbone + - real-time +research_paper: https://arxiv.org/abs/1505.04597 +research_paper_title: "U-Net: Convolutional Networks for Biomedical Image Segmentation" +license: https://github.com/milesial/Pytorch-UNet/blob/master/LICENSE +source_repo: https://github.com/milesial/Pytorch-UNet +technical_details: + Number of parameters: 31M + Model size: 125 MB + Model checkpoint: unet_carvana_scale1.0_epoch2 + Input resolution: 224x224 +applicable_scenarios: + - Autonomous Vehicles + - Medical Imaging + - Factory Quality Control +related_models: + - fcn_resnet50 +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: gpl-3.0 +dataset: [] diff --git a/qai_hub_models/models/unet_segmentation/model.py b/qai_hub_models/models/unet_segmentation/model.py new file mode 100644 index 00000000..30f397bd --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/model.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from typing import Optional + +import torch + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_torch +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_REPO = "milesial/Pytorch-UNet" +MODEL_TYPE = "unet_carvana" +MODEL_ASSET_VERSION = 1 +DEFAULT_WEIGHTS = CachedWebModelAsset( + "https://github.com/milesial/Pytorch-UNet/releases/download/v3.0/unet_carvana_scale1.0_epoch2.pth", + MODEL_ID, + MODEL_ASSET_VERSION, + "unet_carvana_scale1.0_epoch2.pth", +) + + +class UNet(BaseModel): + def __init__(self, net: torch.nn.Module) -> None: + super().__init__() + self.net = net + + @classmethod + def from_pretrained(cls, ckpt_url: Optional[str] = DEFAULT_WEIGHTS): + net = torch.hub.load( + MODEL_REPO, MODEL_TYPE, pretrained=False, scale=1.0, trust_repo=True + ) + if ckpt_url is not None: + state_dict = load_torch(ckpt_url) + 
net.load_state_dict(state_dict) + return cls(net.eval()) + + def forward(self, image: torch.Tensor): + """ + Run UNet on `image`, and produce a segmentation mask over the image. + + Parameters: + image: A [1, 3, H, W] image. + The smaller of H, W should be >= 16, the larger should be >=32 + Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + mask: Shape [1, n_classes, H, W] where H, W are the same as the input image. + n_classes is 2 for the default model. + + Each channel represents the raw logit predictions for a given class. + Taking the softmax over all channels for a given pixel gives the + probability distribution over classes for that pixel. + """ + return self.net(image) + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 224, + width: int = 224, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. 
+ """ + return {"image": ((batch_size, num_channels, height, width), "float32")} diff --git a/qai_hub_models/models/unet_segmentation/perf.yaml b/qai_hub_models/models/unet_segmentation/perf.yaml new file mode 100644 index 00000000..42a0fc85 --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Unet-Segmentation + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 7708.0 + throughput: 129.73533990659055 + estimated_peak_memory_range: + min: 442368 + max: 29540072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 31 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31 + job_id: j7gjr207p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7735.0 + throughput: 129.2824822236587 + estimated_peak_memory_range: + min: 421888 + max: 282981312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 52 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 52 + job_id: jlpe7wr75 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:43:41.073611Z' diff --git a/qai_hub_models/models/unet_segmentation/test.py b/qai_hub_models/models/unet_segmentation/test.py new file mode 100644 index 00000000..f7b93b75 --- /dev/null +++ 
b/qai_hub_models/models/unet_segmentation/test.py @@ -0,0 +1,32 @@ +import numpy as np +from PIL.Image import fromarray + +from qai_hub_models.models.unet_segmentation.app import UNetSegmentationApp +from qai_hub_models.models.unet_segmentation.demo import IMAGE_ADDRESS +from qai_hub_models.models.unet_segmentation.demo import main as demo_main +from qai_hub_models.models.unet_segmentation.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + UNet, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image + +OUTPUT_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_output.png" +) + + +def test_task(): + net = UNet.from_pretrained() + + img = load_image(IMAGE_ADDRESS) + mask = UNetSegmentationApp(net).predict(img) + + # Convert raw mask of 0s and 1s into a PIL Image + img = fromarray(mask) + expected_out = load_image(OUTPUT_ADDRESS) + np.testing.assert_allclose(np.array(img), np.array(expected_out)) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/vit/README.md b/qai_hub_models/models/vit/README.md new file mode 100644 index 00000000..f1c29bfc --- /dev/null +++ b/qai_hub_models/models/vit/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [VIT: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/vit) + +VIT is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of VIT found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/vit). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.vit.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.vit.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of VIT can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
+ + +## References +* [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py) diff --git a/qai_hub_models/models/vit/__init__.py b/qai_hub_models/models/vit/__init__.py new file mode 100644 index 00000000..c343d518 --- /dev/null +++ b/qai_hub_models/models/vit/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import VIT as Model # noqa: F401 diff --git a/qai_hub_models/models/vit/demo.py b/qai_hub_models/models/vit/demo.py new file mode 100644 index 00000000..488c5bd8 --- /dev/null +++ b/qai_hub_models/models/vit/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.vit.model import VIT + + +def main(is_test: bool = False): + imagenet_demo(VIT, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/vit/export.py b/qai_hub_models/models/vit/export.py new file mode 100644 index 00000000..1c78dc83 --- /dev/null +++ b/qai_hub_models/models/vit/export.py @@ -0,0 +1,187 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.vit import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "vit" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "vit", + "VIT", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace( + model, make_torch_inputs(input_spec), check_trace=False + ) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/vit/info.yaml b/qai_hub_models/models/vit/info.yaml new file mode 100644 index 00000000..1c7fab52 --- /dev/null +++ b/qai_hub_models/models/vit/info.yaml @@ -0,0 +1,38 @@ +name: VIT +# id must match with the model dir name in qai_hub_models +id: vit +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: VIT is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone +research_paper: https://arxiv.org/abs/2010.11929 +research_paper_title: "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py +technical_details: + Number of parameters: 86.6M + Model size: 330 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/vit/model.py b/qai_hub_models/models/vit/model.py new file mode 100644 index 00000000..37c072cd --- /dev/null +++ b/qai_hub_models/models/vit/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class VIT(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.vit_b_16(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/vit/perf.yaml b/qai_hub_models/models/vit/perf.yaml new file mode 100644 index 00000000..3eebfd79 --- /dev/null +++ b/qai_hub_models/models/vit/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung 
Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: VIT + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 135762.0 + throughput: 7.365831381388017 + estimated_peak_memory_range: + min: 147456 + max: 3331880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 557 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 557 + job_id: j1gly2ll5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:21:41.057280Z' diff --git a/qai_hub_models/models/vit/test.py b/qai_hub_models/models/vit/test.py new file mode 100644 index 00000000..335475a0 --- /dev/null +++ b/qai_hub_models/models/vit/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( # run_imagenet_classifier_trace_test, + run_imagenet_classifier_test, +) +from qai_hub_models.models.vit.demo import main as demo_main +from qai_hub_models.models.vit.model import MODEL_ID, VIT + + +def test_task(): + run_imagenet_classifier_test(VIT.from_pretrained(), MODEL_ID) + + +# TODO: Fix this export test. 
+# def test_trace(): +# run_imagenet_classifier_trace_test(VIT.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/whisper_asr/README.md b/qai_hub_models/models/whisper_asr/README.md new file mode 100644 index 00000000..593bd344 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Whisper-Base: Automatic speech recognition (ASR) model for multilingual transcription as well as translation](https://aihub.qualcomm.com/models/whisper_asr) + +State-of-the-art encoder-decoder transformer model. The encoder takes an audio chunk (around 30 seconds) converted to a log-Mel spectrogram. The decoder predicts the corresponding text caption intermixed with special tokens that can be used to direct the single model to perform various speech tasks. + +This is based on the implementation of Whisper-Base found +[here](https://github.com/openai/whisper/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/whisper_asr). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[whisper_asr]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.whisper_asr.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. 
+ +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.whisper_asr.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Whisper-Base can be found + [here](https://github.com/openai/whisper/blob/main/LICENSE). + + +## References +* [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) +* [Source Model Implementation](https://github.com/openai/whisper/tree/main) diff --git a/qai_hub_models/models/whisper_asr/__init__.py b/qai_hub_models/models/whisper_asr/__init__.py new file mode 100644 index 00000000..741810ae --- /dev/null +++ b/qai_hub_models/models/whisper_asr/__init__.py @@ -0,0 +1,3 @@ +from .app import WhisperApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import Whisper as Model # noqa: F401 diff --git a/qai_hub_models/models/whisper_asr/app.py b/qai_hub_models/models/whisper_asr/app.py new file mode 100644 index 00000000..1bdeb5e9 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/app.py @@ -0,0 +1,341 @@ +from __future__ import annotations + +from typing import List, Tuple + +import numpy as np +import torch +import whisper # type: ignore +from scipy import special as scipy_special # type: ignore + +from qai_hub_models.models.whisper_asr.model import Whisper +from qai_hub_models.utils.model_adapters import TorchNumpyAdapter + +# hard-coded audio hyperparameters +SAMPLE_RATE = 16000 +N_FFT = 400 +N_MELS = 80 +HOP_LENGTH = 160 +CHUNK_LENGTH = 30 +N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk + + +class WhisperApp: 
+ """ + WhisperApp runs Whisper encoder and decoder to transcribe audio + represented as mel spectrogram. It support all model variants of + OpenAI Whisper. + """ + + def __init__(self, whisper: Whisper): + decoder = whisper.decoder + encoder = whisper.encoder + self.num_decoder_blocks = whisper.num_decoder_blocks + self.attention_dim = whisper.attention_dim + + # Wraps torch Module so it takes np ndarray as input and outputs + if isinstance(encoder, torch.nn.Module): + self.encoder = TorchNumpyAdapter(encoder) + else: + self.encoder = encoder + if isinstance(decoder, torch.nn.Module): + self.decoder = TorchNumpyAdapter(decoder) + else: + self.decoder = decoder + + def predict(self, *args, **kwargs): + # See transcribe. + return self.transcribe(*args, **kwargs) + + def transcribe(self, mel_input: np.ndarray) -> str: + """ + Transcribe an audio to text. + + Parameters: + + - mel_input: of shape (1, 80, 3000). Mel spectrogram of 30s audio. + + Returns: + + - transcribed texts + """ + cross_attn_cache = self.encoder(mel_input) + # Start decoding + # coreml only takes float tensors + x = np.array([[TOKEN_SOT]]) + decoded_tokens = [TOKEN_SOT] + cache_tensor = np.array([], dtype=np.float32).reshape( + (1, 0, self.attention_dim) + ) + self_attn_cache = [cache_tensor] * 2 * self.num_decoder_blocks + + sample_len = 224 # max # of tokens to sample + sum_logprobs = 0 + for i in range(sample_len): + decoder_out = self.decoder(x, *cross_attn_cache, *self_attn_cache) + # logit has shape (1, decoded_len, 51864) + logits = decoder_out[0] + self_attn_cache = decoder_out[1:] # type: ignore + # logit has shape (51864,) + logits = logits[0, -1] # consider only the last token + + # Filters + # SuppressBlank + if i == 0: + logits[[TOKEN_EOT, TOKEN_BLANK]] = -np.inf + # SuppressTokens + logits[NON_SPEECH_TOKENS] = -np.inf + + logits, logprobs = apply_timestamp_rules(logits, decoded_tokens) + + if i == 0: + # detect no_speech + no_speech_prob = np.exp(logprobs[TOKEN_NO_SPEECH]) + if 
no_speech_prob > NO_SPEECH_THR: + break + + # temperature = 0 + next_token = np.argmax(logits) + if next_token == TOKEN_EOT: + break + + sum_logprobs += logprobs[next_token] + x = np.array([[next_token]]) + decoded_tokens.append(int(next_token)) + + tokenizer = whisper.decoding.get_tokenizer( + multilingual=False, language="en", task="transcribe" + ) + + text = tokenizer.decode(decoded_tokens[1:]) # remove TOKEN_SOT + return text.strip() + + +# Whisper constants +TOKEN_SOT = 50257 # Start of transcript +TOKEN_EOT = 50256 # end of transcript +TOKEN_BLANK = 220 # " " +TOKEN_NO_TIMESTAMP = 50362 +TOKEN_TIMESTAMP_BEGIN = 50363 +TOKEN_NO_SPEECH = 50361 + +# Above this prob we deem there's no speech in the audio +NO_SPEECH_THR = 0.6 + +# https://github.com/openai/whisper/blob/v20230314/whisper/decoding.py#L600 +NON_SPEECH_TOKENS = [ + 1, + 2, + 7, + 8, + 9, + 10, + 14, + 25, + 26, + 27, + 28, + 29, + 31, + 58, + 59, + 60, + 61, + 62, + 63, + 90, + 91, + 92, + 93, + 357, + 366, + 438, + 532, + 685, + 705, + 796, + 930, + 1058, + 1220, + 1267, + 1279, + 1303, + 1343, + 1377, + 1391, + 1635, + 1782, + 1875, + 2162, + 2361, + 2488, + 3467, + 4008, + 4211, + 4600, + 4808, + 5299, + 5855, + 6329, + 7203, + 9609, + 9959, + 10563, + 10786, + 11420, + 11709, + 11907, + 13163, + 13697, + 13700, + 14808, + 15306, + 16410, + 16791, + 17992, + 19203, + 19510, + 20724, + 22305, + 22935, + 27007, + 30109, + 30420, + 33409, + 34949, + 40283, + 40493, + 40549, + 47282, + 49146, + 50257, + 50357, + 50358, + 50359, + 50360, + 50361, +] + +SAMPLE_BEGIN = 1 # first token is TOKEN_SOT + +# https://github.com/openai/whisper/blob/v20230314/whisper/decoding.py#L545 +precision = 0.02 # in second +max_initial_timestamp = 1.0 # in second +max_initial_timestamp_index = int(max_initial_timestamp / precision) + + +def apply_timestamp_rules( + logits: np.ndarray, tokens: List[int] +) -> Tuple[np.ndarray, float]: + """ + When predicting timestamps, there are a few post processing rules / + heuristics to 
ensure well-formed timestamps. See in-line comments for details + + Args: + - logits: of shape (51864,) + + Returns: + + - modified logits + - log probability of modified logits (log(softmax(logits))) + """ + # Require producing timestamp + logits[TOKEN_NO_TIMESTAMP] = -np.inf + + # timestamps have to appear in pairs, except directly before EOT + seq = tokens[SAMPLE_BEGIN:] + last_was_timestamp = len(seq) >= 1 and seq[-1] >= TOKEN_TIMESTAMP_BEGIN + penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= TOKEN_TIMESTAMP_BEGIN + if last_was_timestamp: + if penultimate_was_timestamp: # has to be non-timestamp + logits[TOKEN_TIMESTAMP_BEGIN:] = -np.inf + else: # cannot be normal text tokens + logits[:TOKEN_EOT] = -np.inf + + timestamps = [t for t in tokens if t >= TOKEN_TIMESTAMP_BEGIN] + if len(timestamps) > 0: + # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last + # also force each segment to have a nonzero length, to prevent infinite looping + if last_was_timestamp and not penultimate_was_timestamp: + timestamp_last = timestamps[-1] + else: + timestamp_last = timestamps[-1] + 1 + logits[TOKEN_TIMESTAMP_BEGIN:timestamp_last] = -np.inf + + if len(tokens) == SAMPLE_BEGIN: + # suppress generating non-timestamp tokens at the beginning + logits[:TOKEN_TIMESTAMP_BEGIN] = -np.inf + + # apply the `max_initial_timestamp` option + last_allowed = TOKEN_TIMESTAMP_BEGIN + max_initial_timestamp_index + logits[(last_allowed + 1) :] = -np.inf + + # if sum of probability over timestamps is above any other token, sample timestamp + logprobs = scipy_special.log_softmax(logits) + timestamp_logprob = scipy_special.logsumexp(logprobs[TOKEN_TIMESTAMP_BEGIN:]) + max_text_token_logprob = logprobs[:TOKEN_TIMESTAMP_BEGIN].max() + if timestamp_logprob > max_text_token_logprob: + # Mask out all but timestamp tokens + logits[:TOKEN_TIMESTAMP_BEGIN] = -np.inf + + return logits, logprobs + + +def load_audio(mel_filter: np.ndarray, audio_path: str) -> np.ndarray: + """ + 
Load audio to a mel spectrogram. + """ + with np.load(audio_path) as f: + audio_np = f["audio"] + # Pad 30-seconds of silence to the input audio, for slicing + input_feature = log_mel_spectrogram(mel_filter, audio_np, pad_to_length=N_SAMPLES) + # input_feature has fixed shape [1, 80, 3000]. 80 is + # spectrogram feature dim, 3000 is due to Whisper only takes + # 30 seconds input represented as 10ms spectrogram segments + assert input_feature.shape == (1, 80, 3000) + return input_feature + + +def load_mel_filter(mel_filter_path: str) -> np.ndarray: + with np.load(mel_filter_path) as f: + return f["mel_80"] + + +# Adopted from https://github.com/openai/whisper/blob/main/whisper/audio.py +def log_mel_spectrogram( + mel_filter: np.ndarray, + audio_np: np.ndarray, + pad_to_length: int, +) -> np.ndarray: + """ + Compute the log-Mel spectrogram of + + Parameters + ---------- + audio_np: np.ndarray, shape = (*) + + pad_to_length: int + Add zero samples to the right till this length. No op if + len(audio) >= pad_to_length + + Returns + ------- + np.ndarray, shape = (1, 80, n_frames) + A Tensor that contains the Mel spectrogram. 
n_frames = 3000 for whisper + """ + audio = torch.from_numpy(audio_np) + assert isinstance(audio, torch.Tensor) + + if pad_to_length is not None: + padding = pad_to_length - len(audio) + if padding > 0: + audio = torch.nn.functional.pad(audio, (0, padding)) + window = torch.hann_window(N_FFT) + stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True) + magnitudes = stft[..., :-1].abs() ** 2 + + mel_spec = torch.from_numpy(mel_filter) @ magnitudes + + log_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + return log_spec.unsqueeze(0).detach().float().numpy() diff --git a/qai_hub_models/models/whisper_asr/demo.py b/qai_hub_models/models/whisper_asr/demo.py new file mode 100644 index 00000000..0e704c45 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/demo.py @@ -0,0 +1,38 @@ +from qai_hub_models.models.whisper_asr.app import ( + WhisperApp, + load_audio, + load_mel_filter, +) +from qai_hub_models.models.whisper_asr.model import ( + MEL_FILTER_PATH, + MODEL_ASSET_VERSION, + MODEL_ID, + Whisper, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +TEST_AUDIO_PATH = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "audio/jfk.npz" +) + + +def main(): + # For other model sizes, see https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17 + app = WhisperApp(Whisper.from_pretrained()) + TEST_AUDIO_PATH.fetch() + MEL_FILTER_PATH.fetch() + + # Load audio into mel spectrogram + mel_filter_path = MEL_FILTER_PATH.path() + mel_filter = load_mel_filter(mel_filter_path) + + audio_path = TEST_AUDIO_PATH.path() + mel_input = load_audio(mel_filter, audio_path) + + # Perform transcription + transcription = app.transcribe(mel_input) + print("Transcription:", transcription) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_asr/export.py b/qai_hub_models/models/whisper_asr/export.py 
new file mode 100644 index 00000000..7f491853 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/export.py @@ -0,0 +1,217 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Mapping, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.whisper_asr import Model +from qai_hub_models.utils.args import ( + export_parser, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + +ALL_COMPONENTS = ["WhisperEncoder", "WhisperDecoder"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[ + str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. 
+ + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "whisper_asr" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + component_arg = components + components = components or ALL_COMPONENTS + for component in components: + if component not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "whisper_asr", + "Whisper-Base", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict = {} + if "WhisperEncoder" in components: + components_dict["WhisperEncoder"] = model.encoder + if "WhisperDecoder" in components: + components_dict["WhisperDecoder"] = model.decoder + + compile_jobs = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input audio" + ) + print(f"Optimizing model {component_name} to run on-device.") + compile_jobs[component_name] = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{component_name}", + options=model_compile_options, + ) + + # 3. 
Profile the model assets on real devices + profile_jobs = {} + if not skip_profiling: + for component_name in components: + print(f"Profiling model {component_name} on a hosted device.") + profile_jobs[component_name] = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_jobs = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + sample_inputs = components_dict[component_name].sample_inputs() + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "audio", sample_inputs, target_runtime + ) + inference_jobs[component_name] = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=f"{component_name}", + options=profile_options, + ) + + # 5. Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model = compile_job.get_target_model() + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_asr/info.yaml b/qai_hub_models/models/whisper_asr/info.yaml new file mode 100644 index 00000000..1541b6f9 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/info.yaml @@ -0,0 +1,37 @@ +name: Whisper-Base +# id must match with the model dir name in qai_hub_models +id: whisper_asr +status: public +headline: Automatic speech recognition (ASR) model for multilingual + transcription as well as translation. +domain: Audio +description: State-of-art model encoder-decoder transformer. The encoder takes + an audio chunk (around 30 second) converted to a log-Mel spectrogram. The + decoder predicts the corresponding text caption intermixed with special tokens + that can be used to direct the single model to perform various speech tasks. 
+use_case: Speech Recognition +tags: + - foundation +research_paper: https://cdn.openai.com/papers/whisper.pdf +research_paper_title: Robust Speech Recognition via Large-Scale Weak Supervision +license: https://github.com/openai/whisper/blob/main/LICENSE +source_repo: https://github.com/openai/whisper/tree/main +technical_details: + Encoder Number of parameters: 37.2M + Decoder Number of parameters: 29.6M + Model size: 270 MB + Model checkpoint: Tiny En + Input resolution: 80x3000 +applicable_scenarios: + - Smart Home + - Accessibility +related_models: + - huggingface_wavlm_base_plus +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: yes +license_type: mit +dataset: [] diff --git a/qai_hub_models/models/whisper_asr/model.py b/qai_hub_models/models/whisper_asr/model.py new file mode 100644 index 00000000..8ff0c818 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/model.py @@ -0,0 +1,343 @@ +from __future__ import annotations + +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +import whisper # type: ignore + +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BaseModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MAX_DECODE_LEN = 448 + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +MEL_FILTER_PATH = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "openai_assets/mel_filters.npz" +) + + +class Whisper(CollectionModel): + def __init__( + self, + encoder: Callable[[torch.Tensor], List[torch.Tensor]], + decoder: Callable[..., Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]], + num_decoder_blocks: int, + attention_dim: int, + ): + self.encoder = encoder + self.decoder = decoder + self.num_decoder_blocks = num_decoder_blocks + self.attention_dim = attention_dim + + @classmethod + def from_pretrained(cls, model: str = "tiny.en"): + # For other model sizes, see 
https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17 + return cls.from_source_model(whisper.load_model(model)) + + @classmethod + def from_source_model(cls, whisper_model: Any): + encoder = WhisperEncoderInf(whisper_model) + decoder = WhisperDecoderInf(whisper_model.decoder) + num_decoder_blocks = len(decoder.blocks) + attention_dim = decoder.attention_dim + return cls(encoder, decoder, num_decoder_blocks, attention_dim) # type: ignore + + +class WhisperEncoderInf(BaseModel): + """ + WhisperEncoder optimized for export and inference. + + It takes audio input (mel) and directly produce cross attention + kv-cache. + """ + + def __init__(self, model: whisper.model.Whisper): + super().__init__() + self.model = model + + def forward(self, audio: torch.Tensor) -> List[torch.Tensor]: + # Return 2 * self.num_blocks tensors (k, v for each block) + encoder_out = self.model.encoder(audio) + res = [] + for residual_block in self.model.decoder.blocks: + res.append(residual_block.cross_attn.key(encoder_out)) + res.append(residual_block.cross_attn.value(encoder_out)) + return res + + def get_input_spec(self) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. + """ + return dict(audio=((1, 80, 3000), "float32")) + + @classmethod + def from_pretrained(cls): + return Whisper.from_pretrained().encoder + + +class WhisperDecoderInf(BaseModel): + """ + whisper.model.TextDecoder optimized for export and inference: + + Wraps `whisper.model.TextDecoder` to facilitate export: + + 1. kv cache inputs are individual tensors instead of a list of tensors + 2. 
kv cache inputs are required, not optional + """ + + def __init__(self, model: whisper.model.TextDecoder): + super().__init__() + assert isinstance(model, whisper.model.TextDecoder) + + # Wraps `ResidualAttentionBlock` in + # `ResidualAttentionBlockWrapper` + self.blocks = torch.nn.ModuleList( + [ResidualAttentionBlockWrapper(b) for b in model.blocks] + ) + for m in ["token_embedding", "ln"]: + self.add_module(m, getattr(model, m)) + for p in ["positional_embedding"]: + self.register_parameter(p, getattr(model, p)) + + @property + def attention_dim(self): + return self.blocks[0].attn_ln.weight.shape[0] + + def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs): + """ + Args: + + - x: torch.LongTensor, shape = (batch_size, <= n_ctx) + the text tokens + + - kv_cache_args: Tuple of length 4 * num_decoder_blocks. Elements are: + + b{i}_cross_attn_k: [1, 1500, attn_dim] + b{i}_cross_attn_v: [1, 1500, attn_dim] + + for i = 0, ..., num_blocks + + followed by + + b{i}_self_attn_k: [1, decoded_len, attn_dim] + b{i}_self_attn_v: [1, decoded_len, attn_dim] + + for i = 0, ..., num_blocks + + Returns: + + - logits: of shape [1, 1, 51864] + - b0_self_attn_k, b0_self_attn_v, b1_self_attn_k, ...: Updated self attn cache. 
+ 2*num_decoder_blocks + """ + if not kv_cache_args: + kv_cache_args = list(kv_cache_kwargs.values()) + assert isinstance(self.token_embedding, torch.nn.Module) # for mypy + assert isinstance(self.ln, torch.nn.Module) # for mypy + assert isinstance(self.positional_embedding, torch.nn.Parameter) # for mypy + # Set up kv_cache + kv_cache = {} # torch.nn.Module -> torch.Tensor + num_blocks = len(self.blocks) + for i, block in enumerate(self.blocks): + kv_cache.update( + { + block.attn.key: kv_cache_args[2 * num_blocks + i * 2], + block.attn.value: kv_cache_args[2 * num_blocks + i * 2 + 1], + block.cross_attn.key: kv_cache_args[i * 2], + block.cross_attn.value: kv_cache_args[i * 2 + 1], + } + ) + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = ( + self.token_embedding(x) + + self.positional_embedding[offset : offset + x.shape[-1]] + ) + + # x shape: (1, 1, 384) + kv_cache_new = [] + for block in self.blocks: + x, k_cache, v_cache = block(x, kv_cache=kv_cache) + kv_cache_new.append(k_cache.float()) + kv_cache_new.append(v_cache.float()) + + x = self.ln(x) + logits = ( + x + @ torch.transpose( + self.token_embedding.weight.to(x.dtype), 0, 1 # type: ignore + ) + ).float() + + # shape: [1, 1, 51864] + return (logits,) + tuple(kv_cache_new) + + def get_input_spec(self) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. 
+ """ + specs = dict(x=((1, 1), "int32")) + for i in range(len(self.blocks)): + specs[f"b{i}_cross_attn_k"] = ((1, 1500, self.attention_dim), "float32") + specs[f"b{i}_cross_attn_v"] = ((1, 1500, self.attention_dim), "float32") + + # Use mean length for profiling + mean_decode_len = MAX_DECODE_LEN // 2 + + for i in range(len(self.blocks)): + specs[f"b{i}_self_attn_k"] = ( + (1, mean_decode_len, self.attention_dim), + "float32", + ) + specs[f"b{i}_self_attn_v"] = ( + (1, mean_decode_len, self.attention_dim), + "float32", + ) + + return specs + + @classmethod + def from_pretrained(cls): + return Whisper.from_pretrained().decoder + + +class MHAWrapper(torch.nn.Module): + """ + Wrapper around whisper.model.MultiHeadAttention to leverage kv cache for + efficient inference. The original whisper.model.MultiHeadAttention doesn't + returns the updated kv cache but relies on pytorch hook which + cannot be exported for on-device inference. This wrapper fixes that. + + If attn_type == "self_attention", the kv cache is updated before they are returned. + + If attn_type == "cross_attention", the kv cache is returned without any update. + + Note that unlike whisper.model.MultiHeadAttention, this wrapper is + optimized for inference so it doesn't take mask as an input. + """ + + def __init__(self, model: whisper.model.MultiHeadAttention, attn_type: str): + """ + attn_type: one of {"self_attention", "cross_attention"} + """ + super().__init__() + assert isinstance(model, whisper.model.MultiHeadAttention) + self.attn_type = attn_type + self.n_head = model.n_head + for m in ["query", "key", "value", "out"]: + self.add_module(m, getattr(model, m)) + + def forward( + self, + x: torch.Tensor, + kv_cache: Dict[torch.nn.Module, torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + + - x: shape [1, 1, attention_dim]. Input feature. + + - kv_cache: 4 * num_decoder_blocks entries representing self attention + and cross attention from all attention blocks. 
Each entry of shape + [1, decoded_len, attention_dim]. We'd only use cache relevant to this + particular attention layer and ignore other entries in the dict. + + Returns: + + - x_out: attention output + + - updated k, v cache: of shape [1, decoded_len+1, attention_dim] + """ + assert isinstance(self.query, torch.nn.Module) # for mypy + assert isinstance(self.key, torch.nn.Module) # for mypy + assert isinstance(self.value, torch.nn.Module) # for mypy + assert isinstance(self.out, torch.nn.Module) # for mypy + q = self.query(x) + + if self.attn_type == "self_attention": + k_cache = kv_cache[self.key] + v_cache = kv_cache[self.value] + k = self.key(x) + v = self.value(x) + k = torch.cat([k_cache, k], dim=1) + v = torch.cat([v_cache, v], dim=1) + else: # cross_attention + k, v = kv_cache[self.key], kv_cache[self.value] + + wv = qkv_attention(q, k, v, self.n_head) + # Return updated kv cache + return self.out(wv), k.detach(), v.detach() + + +def qkv_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + n_head: int, + mask: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Adapted from whisper.model.MultiHeadAttention.qkv_attention + """ + n_batch, n_ctx, n_state = q.shape + scale = (n_state // n_head) ** -0.25 + q = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale + k = k.view(*k.shape[:2], n_head, -1).permute(0, 2, 3, 1) * scale + v = v.view(*v.shape[:2], n_head, -1).permute(0, 2, 1, 3) + + qk = q @ k + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + qk = qk.float() + + w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype) + return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + + +class ResidualAttentionBlockWrapper(torch.nn.Module): + """ + Wrapper around whisper.model.ResidualAttentionBlock to leverage kv cache + for efficient inference. The original whisper.model.ResidiualAttentionBlock + doesn't returns the updated kv cache but relies on pytorch hook which + cannot be exported for on-device inference. 
This wrapper fixes that. + """ + + def __init__(self, model: whisper.model.ResidualAttentionBlock): + super().__init__() + assert isinstance(model, whisper.model.ResidualAttentionBlock) + # Wraps `MultiheadAttention` to `MultiheadAttentionWrapper` + self.attn = MHAWrapper(model.attn, "self_attention") + self.cross_attn = MHAWrapper(model.cross_attn, "cross_attention") + for m in ["attn_ln", "cross_attn_ln", "mlp", "mlp_ln"]: + self.add_module(m, getattr(model, m)) + + def forward( + self, + x: torch.Tensor, + kv_cache: Dict[torch.nn.Module, torch.Tensor], + ): + """ + Args: Same as MHAWrapper + Returns: Same as MHAWrapper + """ + # Get updated self attention kv cache + assert isinstance(self.attn, torch.nn.Module) # for mypy + assert isinstance(self.attn_ln, torch.nn.Module) # for mypy + assert isinstance(self.cross_attn_ln, torch.nn.Module) # for mypy + assert isinstance(self.cross_attn, torch.nn.Module) # for mypy + assert isinstance(self.mlp, torch.nn.Module) # for mypy + assert isinstance(self.mlp_ln, torch.nn.Module) # for mypy + x_attn, k_cache, v_cache = self.attn(self.attn_ln(x), kv_cache=kv_cache) + x = x + x_attn + if self.cross_attn: + # Ignore cross attn kv cache which is constant (pre-computed in + # `WhisperCrossAttnKVCacheTorch`) + x_cross_attn, _, _ = self.cross_attn( + self.cross_attn_ln(x), kv_cache=kv_cache + ) + x = x + x_cross_attn + x = x + self.mlp(self.mlp_ln(x)) + return x, k_cache, v_cache diff --git a/qai_hub_models/models/whisper_asr/perf.yaml b/qai_hub_models/models/whisper_asr/perf.yaml new file mode 100644 index 00000000..f8e81783 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/perf.yaml @@ -0,0 +1,107 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - 
Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: WhisperEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 68918.0 + throughput: 14.50999738820047 + estimated_peak_memory_range: + min: 18612224 + max: 67240168 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 216 + layers_on_cpu: 0 + total_layers: 216 + job_id: j1p3z16z5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:31:20.996693Z' +- name: WhisperDecoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 7924.0 + throughput: 126.19888944977284 + estimated_peak_memory_range: + min: 3014656 + max: 5380072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 293 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 293 + job_id: jwgoln8dg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: 
'2024-02-21T16:35:36.297844Z' diff --git a/qai_hub_models/models/whisper_asr/requirements.txt b/qai_hub_models/models/whisper_asr/requirements.txt new file mode 100644 index 00000000..75b1cf12 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/requirements.txt @@ -0,0 +1,2 @@ +openai-whisper==20230314 +scipy diff --git a/qai_hub_models/models/whisper_asr/test.py b/qai_hub_models/models/whisper_asr/test.py new file mode 100644 index 00000000..f06fefd4 --- /dev/null +++ b/qai_hub_models/models/whisper_asr/test.py @@ -0,0 +1,79 @@ +import numpy as np +import pytest +import torch +import whisper + +from qai_hub_models.models.whisper_asr.app import ( + WhisperApp, + load_audio, + load_mel_filter, +) +from qai_hub_models.models.whisper_asr.demo import TEST_AUDIO_PATH +from qai_hub_models.models.whisper_asr.demo import main as demo_main +from qai_hub_models.models.whisper_asr.model import ( + MEL_FILTER_PATH, + Whisper, + WhisperDecoderInf, + WhisperEncoderInf, +) + + +@pytest.fixture(scope="session") +def mel_input() -> np.ndarray: + mel_filter_path = MEL_FILTER_PATH.fetch() + mel_filter = load_mel_filter(mel_filter_path) + audio_path = TEST_AUDIO_PATH.fetch() + return load_audio(mel_filter, audio_path) + + +def test_numerics(mel_input): + """ + Test that wrapper classes predict logits (without post processing) that + matches with the original model's. 
+ """ + # OpenAI + with torch.no_grad(): + mel_input = torch.from_numpy(mel_input) + model = whisper.load_model("tiny.en") + audio_features = model.encoder(mel_input) + + tokens = torch.LongTensor([[50257]]) + logits_orig = model.decoder(tokens, audio_features).detach().numpy() + + # QAIHM + encoder = WhisperEncoderInf(model) + decoder = WhisperDecoderInf(model.decoder) + + cross_attn_cache = encoder(mel_input) + cache_tensor = np.array([], dtype=np.float32).reshape((1, 0, 384)) + self_attn_cache = [torch.from_numpy(cache_tensor)] * 2 * 4 + + decoder_out = decoder(tokens, *cross_attn_cache, *self_attn_cache) + logits = decoder_out[0].detach().numpy() + + np.testing.assert_allclose(logits_orig, logits) + + +def test_transcribe(mel_input): + """ + Test that pytorch wrappers produces end to end transcription results that + matches with the original model + """ + # Run inference with OpenAI whisper + with torch.no_grad(): + model = whisper.load_model("tiny.en") + options = whisper.DecodingOptions( + language="en", without_timestamps=False, fp16=False + ) + results = model.decode(torch.from_numpy(mel_input).float(), options) + text_orig = results[0].text + + app = WhisperApp(Whisper.from_source_model(model)) + + # Perform transcription + transcription = app.transcribe(mel_input) + assert transcription == text_orig + + +def test_demo(): + demo_main() diff --git a/qai_hub_models/models/wideresnet50/README.md b/qai_hub_models/models/wideresnet50/README.md new file mode 100644 index 00000000..a723b252 --- /dev/null +++ b/qai_hub_models/models/wideresnet50/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [WideResNet50: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/wideresnet50) + +WideResNet50 is a machine learning model that can classify images from the Imagenet dataset. 
It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of WideResNet50 found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/wideresnet50). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.wideresnet50.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.wideresnet50.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of WideResNet50 can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE).
+ + +## References +* [Wide Residual Networks](https://arxiv.org/abs/1605.07146) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/wideresnet50/__init__.py b/qai_hub_models/models/wideresnet50/__init__.py new file mode 100644 index 00000000..52956d36 --- /dev/null +++ b/qai_hub_models/models/wideresnet50/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import WideResNet50 as Model # noqa: F401 diff --git a/qai_hub_models/models/wideresnet50/demo.py b/qai_hub_models/models/wideresnet50/demo.py new file mode 100644 index 00000000..f840352e --- /dev/null +++ b/qai_hub_models/models/wideresnet50/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.wideresnet50.model import WideResNet50 + + +def main(is_test: bool = False): + imagenet_demo(WideResNet50, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/wideresnet50/export.py b/qai_hub_models/models/wideresnet50/export.py new file mode 100644 index 00000000..9fe2893a --- /dev/null +++ b/qai_hub_models/models/wideresnet50/export.py @@ -0,0 +1,185 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.wideresnet50 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "wideresnet50" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "wideresnet50", + "WideResNet50", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/wideresnet50/info.yaml b/qai_hub_models/models/wideresnet50/info.yaml new file mode 100644 index 00000000..6a3439a6 --- /dev/null +++ b/qai_hub_models/models/wideresnet50/info.yaml @@ -0,0 +1,38 @@ +name: WideResNet50 +# id must match with the model dir name in qai_hub_models +id: wideresnet50 +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: WideResNet50 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone +research_paper: https://arxiv.org/abs/1605.07146 +research_paper_title: Wide Residual Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 68.9M + Model size: 132 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/wideresnet50/model.py b/qai_hub_models/models/wideresnet50/model.py new file mode 100644 index 00000000..c5bc0c8f --- /dev/null +++ b/qai_hub_models/models/wideresnet50/model.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torchvision.models as tv_models + +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier + +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "IMAGENET1K_V1" + + +class WideResNet50(ImagenetClassifier): + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + net = tv_models.wide_resnet50_2(weights=weights) + return cls(net) diff --git a/qai_hub_models/models/wideresnet50/perf.yaml b/qai_hub_models/models/wideresnet50/perf.yaml new file mode 100644 index 00000000..9c9625ba --- /dev/null +++ b/qai_hub_models/models/wideresnet50/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung 
Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: WideResNet50 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 4393.0 + throughput: 227.6348736626451 + estimated_peak_memory_range: + min: 24576 + max: 1816072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 77 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 77 + job_id: jz57el9rp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4605.0 + throughput: 217.15526601520088 + estimated_peak_memory_range: + min: 0 + max: 313348064 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 125 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 125 + job_id: jqp4yd3lp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:11:06.129828Z' diff --git a/qai_hub_models/models/wideresnet50/test.py b/qai_hub_models/models/wideresnet50/test.py new file mode 100644 index 00000000..7bc48ffb --- /dev/null +++ b/qai_hub_models/models/wideresnet50/test.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.wideresnet50.demo import main as demo_main +from qai_hub_models.models.wideresnet50.model import MODEL_ID, WideResNet50 + + +def test_task(): + run_imagenet_classifier_test(WideResNet50.from_pretrained(), MODEL_ID) + + +def test_trace(): + run_imagenet_classifier_trace_test(WideResNet50.from_pretrained()) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/wideresnet50_quantized/README.md 
b/qai_hub_models/models/wideresnet50_quantized/README.md new file mode 100644 index 00000000..a5ac6c61 --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [WideResNet50-Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/wideresnet50_quantized) + +WideResNet50 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of WideResNet50-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/wideresnet50_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.wideresnet50_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.wideresnet50_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of WideResNet50-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). + + +## References +* [Wide Residual Networks](https://arxiv.org/abs/1605.07146) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) diff --git a/qai_hub_models/models/wideresnet50_quantized/__init__.py b/qai_hub_models/models/wideresnet50_quantized/__init__.py new file mode 100644 index 00000000..e14b86ea --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/__init__.py @@ -0,0 +1,7 @@ +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) +from qai_hub_models.models.wideresnet50_quantized.model import MODEL_ID # noqa: F401 +from qai_hub_models.models.wideresnet50_quantized.model import ( # noqa: F401 + WideResNet50Quantizable as Model, +) diff --git a/qai_hub_models/models/wideresnet50_quantized/demo.py b/qai_hub_models/models/wideresnet50_quantized/demo.py new file mode 100644 index 00000000..d2ecb39c --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/demo.py @@ -0,0 +1,10 @@ +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.wideresnet50_quantized.model import WideResNet50Quantizable + + +def main(is_test: bool = False): + imagenet_demo(WideResNet50Quantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/wideresnet50_quantized/export.py b/qai_hub_models/models/wideresnet50_quantized/export.py new file mode 100644 index 00000000..42c6373f --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/export.py @@ -0,0 +1,195 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.wideresnet50_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "wideresnet50_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "wideresnet50_quantized", + "WideResNet50-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics(inference_job, inference_result, torch_out) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/wideresnet50_quantized/info.yaml b/qai_hub_models/models/wideresnet50_quantized/info.yaml new file mode 100644 index 00000000..8d713243 --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/info.yaml @@ -0,0 +1,39 @@ +name: WideResNet50-Quantized +# id must match with the model dir name in qai_hub_models +id: wideresnet50_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: WideResNet50 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
+use_case: Image Classification +tags: + - backbone + - quantized +research_paper: https://arxiv.org/abs/1605.07146 +research_paper_title: Wide Residual Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Number of parameters: 68.9M + Model size: 132 MB + Model checkpoint: Imagenet + Input resolution: 224x224 +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: bsd-3-clause +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/wideresnet50_quantized/model.py b/qai_hub_models/models/wideresnet50_quantized/model.py new file mode 100644 index 00000000..ec276a0c --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/model.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.wideresnet50.model import WideResNet50 +from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "wideresnet50_quantized_encodings.json" + + +class WideResNet50Quantizable(AIMETQuantizableMixin, WideResNet50): + """WideResNet50 with post train quantization support. + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. 
+ Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + WideResNet50.__init__(self, sim_model.model) + AIMETQuantizableMixin.__init__( + self, sim_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "WideResNet50Quantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + model = WideResNet50.from_pretrained() + input_shape = model.get_input_spec()["image_tensor"][0] + + equalize_model(model, input_shape) + sim = QuantizationSimModel( + model.net, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_per_channel_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/wideresnet50_quantized/perf.yaml b/qai_hub_models/models/wideresnet50_quantized/perf.yaml new file mode 100644 index 00000000..b9121909 --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung 
Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: WideResNet50-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1833.0 + throughput: 545.5537370430987 + estimated_peak_memory_range: + min: 28672 + max: 1710680 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 80 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 80 + job_id: jz5wl34jp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1756.0 + throughput: 569.4760820045558 + estimated_peak_memory_range: + min: 520192 + max: 152789048 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jmg9zydvp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:11:48.964511Z' diff --git a/qai_hub_models/models/wideresnet50_quantized/test.py b/qai_hub_models/models/wideresnet50_quantized/test.py new file mode 100644 index 00000000..de7ee662 --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/test.py @@ -0,0 +1,37 @@ +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, + run_imagenet_classifier_trace_test, +) +from qai_hub_models.models.wideresnet50_quantized.demo import main as demo_main +from qai_hub_models.models.wideresnet50_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + WideResNet50Quantizable, +) + + +def test_task(): + run_imagenet_classifier_test( + WideResNet50Quantizable.from_pretrained(), + MODEL_ID, + probability_threshold=0.4, + asset_version=MODEL_ASSET_VERSION, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +def test_trace(): + run_imagenet_classifier_trace_test( + WideResNet50Quantizable.from_pretrained(), 
+ diff_tol=0.01, + rtol=0.02, + atol=0.2, + is_quantized=True, + ) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/xlsr/README.md b/qai_hub_models/models/xlsr/README.md new file mode 100644 index 00000000..742df1d9 --- /dev/null +++ b/qai_hub_models/models/xlsr/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [XLSR: Upscale images in real time](https://aihub.qualcomm.com/models/xlsr) + +XLSR is designed for lightweight real-time upscaling of images. + +This is based on the implementation of XLSR found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/xlsr). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.xlsr.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.xlsr.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
+ +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of XLSR can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). + + +## References +* [Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices](https://arxiv.org/abs/2105.10288) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr) diff --git a/qai_hub_models/models/xlsr/__init__.py b/qai_hub_models/models/xlsr/__init__.py new file mode 100644 index 00000000..37f48d00 --- /dev/null +++ b/qai_hub_models/models/xlsr/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import XLSR as Model # noqa: F401 diff --git a/qai_hub_models/models/xlsr/demo.py b/qai_hub_models/models/xlsr/demo.py new file mode 100644 index 00000000..366235ef --- /dev/null +++ b/qai_hub_models/models/xlsr/demo.py @@ -0,0 +1,15 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.xlsr.model import MODEL_ASSET_VERSION, MODEL_ID, XLSR +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "xlsr_demo.jpg" +) + + +def main(is_test: bool = False): + super_resolution_demo(XLSR, IMAGE_ADDRESS, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/xlsr/export.py b/qai_hub_models/models/xlsr/export.py new file mode 100644 index 00000000..211e7a87 --- /dev/null +++ b/qai_hub_models/models/xlsr/export.py @@ -0,0 +1,190 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.xlsr import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "xlsr" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "xlsr", + "XLSR", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/xlsr/info.yaml b/qai_hub_models/models/xlsr/info.yaml new file mode 100644 index 00000000..de5111ab --- /dev/null +++ b/qai_hub_models/models/xlsr/info.yaml @@ -0,0 +1,31 @@ +name: XLSR +# id must match with the model dir name in qai_hub_models +id: xlsr +status: public +headline: Upscale images in real time. +domain: Computer Vision +use_case: Super Resolution +description: XLSR is designed for lightweight real-time upscaling of images. 
+tags: [] +research_paper: https://arxiv.org/abs/2105.10288 +research_paper_title: Extremely Lightweight Quantization Robust Real-Time Single-Image + Super Resolution for Mobile Devices +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr +technical_details: + Number of parameters: 28.0K + Model size: 117 KB + Model checkpoint: xlsr_4x_checkpoint_float32 + Input resolution: 128x128 +applicable_scenarios: +- Virtual Real Estate Tours +- Gaming +- ARVR +form_factors: +- Phone +- Tablet +related_models: [esrgan, real_esrgan_general_x4v3] +has_static_banner: yes +has_animated_banner: yes +license_type: other +dataset: [] diff --git a/qai_hub_models/models/xlsr/model.py b/qai_hub_models/models/xlsr/model.py new file mode 100644 index 00000000..2fe1fb80 --- /dev/null +++ b/qai_hub_models/models/xlsr/model.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.superres_evaluator import SuperResolutionOutputEvaluator +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/xlsr/model/model_cards/xlsr_4x_w8a8.json +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_february_artifacts/xlsr_4x_checkpoint_float32.pth.tar +XLSR_WEIGHTS = "xlsr_4x_checkpoint_float32.pth.tar" +XLSR_SOURCE_REPOSITORY = "https://github.com/quic/aimet-model-zoo" +XLSR_SOURCE_REPO_COMMIT = "d09d2b0404d10f71a7640a87e9d5e5257b028802" +SCALING_FACTOR = 4 + + +class XLSR(BaseModel): + """Exportable XLSR super resolution model, end-to-end.""" + + 
def __init__( + self, + xlsr_model: torch.nn.Module, + ) -> None: + super().__init__() + self.model = xlsr_model + + @classmethod + def from_pretrained(cls) -> XLSR: + model = _load_xlsr_source_model() + dst = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, XLSR_WEIGHTS + ).fetch() + checkpoint = torch.load(dst, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["state_dict"]) + model.eval() + + return cls(model) + + def get_evaluator(self) -> BaseEvaluator: + return SuperResolutionOutputEvaluator() + + def forward(self, image: torch.Tensor) -> torch.Tensor: + """ + Run XLSR on `image`, and produce an upscaled image + + Parameters: + image: Pixel values pre-processed for model consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + image: Pixel values + Range: float[0, 1] + 3-channel Color Space: RGB + """ + return self.model(image) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 128, + width: int = 128, + ) -> InputSpec: + # Get the input specification ordered (name -> (shape, type)) pairs for this model. + # + # This can be used with the qai_hub python API to declare + # the model input specification upon submitting a profile job. + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def _load_xlsr_source_model() -> torch.nn.Module: + # Instantiate the XLSR architecture from the source repository; weights are loaded by the caller. + # Returns aimet_zoo_torch.common.super_resolution.models.XLSRRelease + with SourceAsRoot( + XLSR_SOURCE_REPOSITORY, XLSR_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + ): + # necessary import. `aimet_zoo_torch` comes from the aimet-model-zoo repo. 
+ from aimet_zoo_torch.common.super_resolution.models import XLSRRelease + + return XLSRRelease(scaling_factor=SCALING_FACTOR) diff --git a/qai_hub_models/models/xlsr/perf.yaml b/qai_hub_models/models/xlsr/perf.yaml new file mode 100644 index 00000000..91c2f707 --- /dev/null +++ b/qai_hub_models/models/xlsr/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: XLSR + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 2523.0 + throughput: 396.3535473642489 + estimated_peak_memory_range: + min: 24576 + max: 1686120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 13 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 16 + job_id: jogk2qlyg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1068.0 + throughput: 936.3295880149813 + estimated_peak_memory_range: + min: 217088 + max: 63076024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 22 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 22 + job_id: jn5qlr77p + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:37:57.776098Z' diff --git a/qai_hub_models/models/xlsr/test.py b/qai_hub_models/models/xlsr/test.py new file mode 100644 index 00000000..e45a5a90 --- /dev/null +++ b/qai_hub_models/models/xlsr/test.py @@ -0,0 
+1,34 @@ +import numpy as np + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.xlsr.demo import IMAGE_ADDRESS +from qai_hub_models.models.xlsr.demo import main as demo_main +from qai_hub_models.models.xlsr.model import MODEL_ASSET_VERSION, MODEL_ID, XLSR +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_LOCAL_PATH = "xlsr_output.png" +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, OUTPUT_IMAGE_LOCAL_PATH +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + model = XLSR.from_pretrained() + app = SuperResolutionApp(model=model) + app_output_image = app.upscale_image(image)[0] + + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/xlsr_quantized/README.md b/qai_hub_models/models/xlsr_quantized/README.md new file mode 100644 index 00000000..edf7ff13 --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [XLSR-Quantized: Upscale images in real time](https://aihub.qualcomm.com/models/xlsr_quantized) + +XLSR is designed for lightweight real-time upscaling of images. + +This is based on the implementation of XLSR-Quantized found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/xlsr_quantized). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.xlsr_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.xlsr_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of XLSR-Quantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). 
+ + +## References +* [Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices](https://arxiv.org/abs/2105.10288) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr) diff --git a/qai_hub_models/models/xlsr_quantized/__init__.py b/qai_hub_models/models/xlsr_quantized/__init__.py new file mode 100644 index 00000000..a3f835a3 --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/__init__.py @@ -0,0 +1,6 @@ +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import XLSRQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/xlsr_quantized/demo.py b/qai_hub_models/models/xlsr_quantized/demo.py new file mode 100644 index 00000000..28a382b3 --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/demo.py @@ -0,0 +1,19 @@ +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.xlsr_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + XLSRQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "xlsr_quantized_demo.jpg" +) + + +def main(is_test: bool = False): + super_resolution_demo(XLSRQuantizable, IMAGE_ADDRESS, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/xlsr_quantized/export.py b/qai_hub_models/models/xlsr_quantized/export.py new file mode 100644 index 00000000..a7391248 --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/export.py @@ -0,0 +1,200 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub + +from qai_hub_models.models.xlsr_quantized import Model +from qai_hub_models.utils.args import ( + TargetRuntime, + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "xlsr_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "xlsr_quantized", + "XLSR-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", hub_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/xlsr_quantized/info.yaml b/qai_hub_models/models/xlsr_quantized/info.yaml new file mode 100644 index 00000000..dd4dd4cb --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/info.yaml @@ -0,0 +1,31 @@ +name: XLSR-Quantized +# id must match with the model dir name in qai_hub_models +id: xlsr_quantized +status: public +headline: Upscale images in real time. +domain: Computer Vision +use_case: Super Resolution +description: XLSR is designed for lightweight real-time upscaling of images. 
+tags: + - quantized +research_paper: https://arxiv.org/abs/2105.10288 +research_paper_title: Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr +technical_details: + Number of parameters: 28K + Model size: 195 KB + Model checkpoint: xlsr_4x_checkpoint_w8a8 + Input resolution: 128x128 +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: ['esrgan', 'real_esrgan_general_x4v3', 'xlsr'] +has_static_banner: yes +has_animated_banner: yes +license_type: other +dataset: [] diff --git a/qai_hub_models/models/xlsr_quantized/model.py b/qai_hub_models/models/xlsr_quantized/model.py new file mode 100644 index 00000000..c716d19a --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/model.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( # isort: skip + AIMETQuantizableMixin, +) + +import torch +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.xlsr.model import XLSR, _load_xlsr_source_model +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset 
+ + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/xlsr/model/model_cards/xlsr_4x_w8a8.json: +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_february_artifacts/xlsr_4x_checkpoint_int8.pth +# and +# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.json +# Encodings were generated with AIMET QuantSim library +XLSR_QUANTIZED_WEIGHTS = "xlsr_4x_checkpoint_int8.pth" +AIMET_ENCODINGS = "aimet_quantization_encodings.json" +AIMET_CONFIG = "default_config_per_channel.json" +SCALING_FACTOR = 4 + + +class XLSRQuantizable(AIMETQuantizableMixin, XLSR): + """XLSR with post training quantization support + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + xlsr_model: QuantizationSimModel, + ) -> None: + XLSR.__init__(self, xlsr_model.model) + AIMETQuantizableMixin.__init__( + self, xlsr_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> XLSRQuantizable: + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on BSD300. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. 
+ """ + xlsr = _load_xlsr_source_model() + input_shape = XLSR.get_input_spec()["image"][0] + + weights = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, XLSR_QUANTIZED_WEIGHTS + ).fetch() + aimet_config = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG + ).fetch() + + # Load the model weights and quantization parameters + state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] + xlsr.load_state_dict(state_dict) + sim = QuantizationSimModel( + xlsr, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=aimet_config, + dummy_input=torch.rand(input_shape), + ) + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + return cls(sim) diff --git a/qai_hub_models/models/xlsr_quantized/perf.yaml b/qai_hub_models/models/xlsr_quantized/perf.yaml new file mode 100644 index 00000000..e76f30ab --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: XLSR-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1298.0 + throughput: 770.4160246533128 + estimated_peak_memory_range: + min: 24576 + max: 1426056 + primary_compute_unit: NPU + precision: fp16 + layer_info: + 
layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 19 + job_id: jo5m064yg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:31:32.010687Z' diff --git a/qai_hub_models/models/xlsr_quantized/test.py b/qai_hub_models/models/xlsr_quantized/test.py new file mode 100644 index 00000000..ef3ef0e3 --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/test.py @@ -0,0 +1,41 @@ +import numpy as np +import torch + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.xlsr_quantized.demo import IMAGE_ADDRESS +from qai_hub_models.models.xlsr_quantized.demo import main as demo_main +from qai_hub_models.models.xlsr_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + XLSRQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_LOCAL_PATH = "xlsr_quantized_output.png" +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, OUTPUT_IMAGE_LOCAL_PATH +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. 
+ torch.manual_seed(0) + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + model = XLSRQuantizable.from_pretrained() + app = SuperResolutionApp(model=model) + app_output_image = app.upscale_image(image)[0] + + np.testing.assert_allclose( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + rtol=0.02, + atol=0.2, + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov6/README.md b/qai_hub_models/models/yolov6/README.md new file mode 100644 index 00000000..fa194543 --- /dev/null +++ b/qai_hub_models/models/yolov6/README.md @@ -0,0 +1,50 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-v6: Real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov6) + +YoloV6 is a machine learning model that predicts bounding boxes and classes of objects in an image. + +This is based on the implementation of Yolo-v6 found +[here](https://github.com/meituan/YOLOv6/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolov6). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolov6.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions.
+ +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.yolov6.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Yolo-v6 can be found + [here](https://github.com/meituan/YOLOv6/blob/47625514e7480706a46ff3c0cd0252907ac12f22/LICENSE). + + +## References +* [YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications](https://arxiv.org/abs/2209.02976) +* [Source Model Implementation](https://github.com/meituan/YOLOv6/) diff --git a/qai_hub_models/models/yolov6/__init__.py b/qai_hub_models/models/yolov6/__init__.py new file mode 100644 index 00000000..7742bd1f --- /dev/null +++ b/qai_hub_models/models/yolov6/__init__.py @@ -0,0 +1,4 @@ +from qai_hub_models.models.yolov6.app import YoloV6DetectionApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import YoloV6 as Model # noqa: F401 diff --git a/qai_hub_models/models/yolov6/app.py b/qai_hub_models/models/yolov6/app.py new file mode 100644 index 00000000..49cff8f2 --- /dev/null +++ b/qai_hub_models/models/yolov6/app.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp +from qai_hub_models.models.yolov6.model import YoloV6 + + +class YoloV6DetectionApp(YoloObjectDetectionApp): + def check_image_size(self, pixel_values: torch.Tensor) -> None: + """ + Verify image size is valid model input. 
+ """ + if len(pixel_values.shape) != 4: + raise ValueError("Pixel Values must be rank 4: [batch, channels, x, y]") + if ( + pixel_values.shape[2] % YoloV6.STRIDE_MULTIPLE != 0 + or pixel_values.shape[3] % YoloV6.STRIDE_MULTIPLE != 0 + ): + raise ValueError( + f"Pixel values must have spatial dimensions (H & W) that are multiples of {YoloV6.STRIDE_MULTIPLE}." + ) diff --git a/qai_hub_models/models/yolov6/demo.py b/qai_hub_models/models/yolov6/demo.py new file mode 100644 index 00000000..e532ee7b --- /dev/null +++ b/qai_hub_models/models/yolov6/demo.py @@ -0,0 +1,25 @@ +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolov6.app import YoloV6DetectionApp +from qai_hub_models.models.yolov6.model import MODEL_ASSET_VERSION, MODEL_ID, YoloV6 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +WEIGHTS_HELP_MSG = ( + "YoloV6 checkpoint name, defined here: https://github.com/meituan/YOLOv6/releases" +) +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/input_image.jpg" +) + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloV6, + YoloV6DetectionApp, + IMAGE_ADDRESS, + YoloV6.STRIDE_MULTIPLE, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov6/export.py b/qai_hub_models/models/yolov6/export.py new file mode 100644 index 00000000..5fc7e499 --- /dev/null +++ b/qai_hub_models/models/yolov6/export.py @@ -0,0 +1,187 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.yolov6 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `<cwd>/build/<model_name>`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "yolov6" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolov6", + "Yolo-v6", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2.
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov6/info.yaml b/qai_hub_models/models/yolov6/info.yaml new file mode 100644 index 00000000..2bc3e86e --- /dev/null +++ b/qai_hub_models/models/yolov6/info.yaml @@ -0,0 +1,35 @@ +name: Yolo-v6 +# id must match with the model dir name in qai_hub_models +id: yolov6 +status: public +headline: Real-time object detection optimized for mobile and edge. +domain: Computer Vision +description: YoloV6 is a machine learning model that predicts bounding boxes and classes of objects in an image. 
+use_case: Object Detection +tags: + - real-time +research_paper: https://arxiv.org/abs/2209.02976 +research_paper_title: "YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications" +license: https://github.com/meituan/YOLOv6/blob/47625514e7480706a46ff3c0cd0252907ac12f22/LICENSE +source_repo: https://github.com/meituan/YOLOv6/ +technical_details: + Number of parameters: 5M + Model size: 20 MB + Model checkpoint: YoloV6-N + Input resolution: 640x640 +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - 'yolov7' + - 'yolov8_det' +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: gpl-3.0 +dataset: [] diff --git a/qai_hub_models/models/yolov6/model.py b/qai_hub_models/models/yolov6/model.py new file mode 100644 index 00000000..47a9776e --- /dev/null +++ b/qai_hub_models/models/yolov6/model.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import tempfile + +import torch +import torch.nn as nn + +from qai_hub_models.models._shared.yolo.utils import detect_postprocess +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + SourceAsRoot, + load_path, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +YOLOV6_SOURCE_REPOSITORY = "https://github.com/meituan/YOLOv6" +YOLOV6_SOURCE_REPO_COMMIT = "55d80c317edd0fb5847e599a1802d394f34a3141" +MODEL_ASSET_VERSION = 1 +MODEL_ID = __name__.split(".")[-2] + +WEIGHTS_PATH = "https://github.com/meituan/YOLOv6/releases/download/0.4.0/" +DEFAULT_WEIGHTS = "yolov6n.pt" + + +class YoloV6(BaseModel): + """Exportable YoloV6 bounding box detector, end-to-end.""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + # All image input spatial dimensions should be a multiple of this stride. 
+ STRIDE_MULTIPLE = 32 + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + model_url = f"{WEIGHTS_PATH}{ckpt_name}" + asset = CachedWebModelAsset(model_url, MODEL_ID, MODEL_ASSET_VERSION, ckpt_name) + model = _load_yolov6_source_model_from_weights(asset) + return cls(model) + + def forward(self, image: torch.Tensor): + """ + Run YoloV6 on `image`, and produce a predicted set of bounding boxes and associated class probabilities. + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + boxes: Shape [batch, num preds, 4] where 4 == (center_x, center_y, w, h) + class scores multiplied by confidence: Shape [batch, num_preds, # of classes (typically 80)] + """ + predictions = self.model(image) + return detect_postprocess(predictions) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type)). This can be + used to submit a profiling job on Qualcomm AI Hub.
+ """ + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def _load_yolov6_source_model_from_weights( + ckpt_path: str | CachedWebModelAsset, +) -> torch.nn.Module: + with tempfile.TemporaryDirectory() as tmpdir: + model_path = load_path(ckpt_path, tmpdir) + with SourceAsRoot( + YOLOV6_SOURCE_REPOSITORY, + YOLOV6_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + from yolov6.layers.common import RepVGGBlock + from yolov6.utils.checkpoint import load_checkpoint + + model = load_checkpoint( + model_path, map_location="cpu", inplace=True, fuse=True + ) + model.export = True + + for layer in model.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + elif isinstance(layer, nn.Upsample) and not hasattr( + layer, "recompute_scale_factor" + ): + layer.recompute_scale_factor = None # torch 1.11.0 compatibility + return model diff --git a/qai_hub_models/models/yolov6/perf.yaml b/qai_hub_models/models/yolov6/perf.yaml new file mode 100644 index 00000000..82e39b9b --- /dev/null +++ b/qai_hub_models/models/yolov6/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Yolo-v6 + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 7848.0 + throughput: 127.420998980632 + estimated_peak_memory_range: + min: 32768 + max: 7233136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 182 + layers_on_gpu: 0 + layers_on_cpu: 0 + 
total_layers: 182 + job_id: jqpyoj4r5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7283.0 + throughput: 137.3060551970342 + estimated_peak_memory_range: + min: 4931584 + max: 17461520 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 230 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 230 + job_id: j2p0m212g + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:12:26.065342Z' diff --git a/qai_hub_models/models/yolov6/test.py b/qai_hub_models/models/yolov6/test.py new file mode 100644 index 00000000..c45b9de9 --- /dev/null +++ b/qai_hub_models/models/yolov6/test.py @@ -0,0 +1,46 @@ +import numpy as np +import torch + +from qai_hub_models.models._shared.yolo.utils import detect_postprocess +from qai_hub_models.models.yolov6.demo import IMAGE_ADDRESS +from qai_hub_models.models.yolov6.demo import main as demo_main +from qai_hub_models.models.yolov6.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + WEIGHTS_PATH, + YoloV6, + _load_yolov6_source_model_from_weights, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.image_processing import preprocess_PIL_image +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_task(): + model_path = f"{WEIGHTS_PATH}{DEFAULT_WEIGHTS}" + asset = CachedWebModelAsset( + model_path, MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_WEIGHTS + ) + + # source model + source_model = _load_yolov6_source_model_from_weights(asset) + + # Qualcomm AI Hub Model + qaihm_model = YoloV6.from_pretrained() + + with torch.no_grad(): + # source model output + processed_sample_image = preprocess_PIL_image(load_image(IMAGE_ADDRESS)) + source_detect_out = source_model(processed_sample_image) + source_out_postprocessed = 
detect_postprocess(source_detect_out) + + # Qualcomm AI Hub Model output + qaihm_out_postprocessed = qaihm_model(processed_sample_image) + for i in range(0, len(source_out_postprocessed)): + assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov7/README.md b/qai_hub_models/models/yolov7/README.md new file mode 100644 index 00000000..bb98d698 --- /dev/null +++ b/qai_hub_models/models/yolov7/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-v7: Real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov7) + +YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. + +This is based on the implementation of Yolo-v7 found +[here](https://github.com/WongKinYiu/yolov7/). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolov7). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolov7]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolov7.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment.
This can be run as follows: + +```bash +python -m qai_hub_models.models.yolov7.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Yolo-v7 can be found + [here](https://github.com/WongKinYiu/yolov7/blob/main/LICENSE.md). + + +## References +* [YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors](https://arxiv.org/abs/2207.02696) +* [Source Model Implementation](https://github.com/WongKinYiu/yolov7/) diff --git a/qai_hub_models/models/yolov7/__init__.py b/qai_hub_models/models/yolov7/__init__.py new file mode 100644 index 00000000..f340dc77 --- /dev/null +++ b/qai_hub_models/models/yolov7/__init__.py @@ -0,0 +1,4 @@ +from qai_hub_models.models.yolov7.app import YoloV7DetectionApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import YoloV7 as Model # noqa: F401 diff --git a/qai_hub_models/models/yolov7/app.py b/qai_hub_models/models/yolov7/app.py new file mode 100644 index 00000000..4d19ad93 --- /dev/null +++ b/qai_hub_models/models/yolov7/app.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp +from qai_hub_models.models.yolov7.model import YoloV7 + + +class YoloV7DetectionApp(YoloObjectDetectionApp): + def check_image_size(self, pixel_values: torch.Tensor) -> None: + """ + Verify image size is valid model input. 
+ """ + if len(pixel_values.shape) != 4: + raise ValueError("Pixel Values must be rank 4: [batch, channels, x, y]") + if ( + pixel_values.shape[2] % YoloV7.STRIDE_MULTIPLE != 0 + or pixel_values.shape[3] % YoloV7.STRIDE_MULTIPLE != 0 + ): + raise ValueError( + f"Pixel values must have spatial dimensions (H & W) that are multiples of {YoloV7.STRIDE_MULTIPLE}." + ) diff --git a/qai_hub_models/models/yolov7/demo.py b/qai_hub_models/models/yolov7/demo.py new file mode 100644 index 00000000..3c039dc3 --- /dev/null +++ b/qai_hub_models/models/yolov7/demo.py @@ -0,0 +1,22 @@ +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolov7.app import YoloV7DetectionApp +from qai_hub_models.models.yolov7.model import MODEL_ASSET_VERSION, MODEL_ID, YoloV7 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "yolov7_demo_640.jpg" +) + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloV7, + YoloV7DetectionApp, + IMAGE_ADDRESS, + YoloV7.STRIDE_MULTIPLE, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov7/export.py b/qai_hub_models/models/yolov7/export.py new file mode 100644 index 00000000..1571baa9 --- /dev/null +++ b/qai_hub_models/models/yolov7/export.py @@ -0,0 +1,187 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.yolov7 import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `<cwd>/build/<model_name>`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "yolov7" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolov7", + "Yolo-v7", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + + # 2.
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov7/info.yaml b/qai_hub_models/models/yolov7/info.yaml new file mode 100644 index 00000000..2113cd26 --- /dev/null +++ b/qai_hub_models/models/yolov7/info.yaml @@ -0,0 +1,35 @@ +name: Yolo-v7 +# id must match with the model dir name in qai_hub_models +id: yolov7 +status: public +headline: Real-time object detection optimized for mobile and edge. +domain: Computer Vision +description: YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. 
+use_case: Object Detection +tags: + - real-time +research_paper: https://arxiv.org/abs/2207.02696 +research_paper_title: "YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors" +license: https://github.com/WongKinYiu/yolov7/blob/main/LICENSE.md +source_repo: https://github.com/WongKinYiu/yolov7/ +technical_details: + Number of parameters: 6.2M + Model size: 25 MB + Model checkpoint: YoloV7 Tiny + Input resolution: 720p (720x1280) +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - 'yolov6' + - 'yolov8_det' +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: gpl-3.0 +dataset: [] diff --git a/qai_hub_models/models/yolov7/model.py b/qai_hub_models/models/yolov7/model.py new file mode 100644 index 00000000..8e79b26c --- /dev/null +++ b/qai_hub_models/models/yolov7/model.py @@ -0,0 +1,233 @@ +from __future__ import annotations + +from typing import Any, List, Mapping, Optional + +import torch + +from qai_hub_models.models._shared.yolo.utils import ( + detect_postprocess, + yolo_sample_inputs, +) +from qai_hub_models.utils.asset_loaders import SourceAsRoot +from qai_hub_models.utils.base_model import BaseModel, InputsType +from qai_hub_models.utils.input_spec import InputSpec + +YOLOV7_SOURCE_REPOSITORY = "https://github.com/WongKinYiu/yolov7" +YOLOV7_SOURCE_REPO_COMMIT = "84932d70fb9e2932d0a70e4a1f02a1d6dd1dd6ca" +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "yolov7-tiny.pt" +MODEL_ASSET_VERSION = 1 + + +class YoloV7(BaseModel): + """Exportable YoloV7 bounding box detector, end-to-end.""" + + def __init__( + self, + yolov7_feature_extractor: torch.nn.Module, + yolov7_detector: torch.nn.Module, + ) -> None: + super().__init__() + self.yolov7_feature_extractor = yolov7_feature_extractor + self.yolov7_detector = yolov7_detector + + # All image input spatial dimensions should be a multiple of this stride. 
+ STRIDE_MULTIPLE = 32 + + @classmethod + def from_pretrained( + cls, + weights_name: Optional[str] = DEFAULT_WEIGHTS, + ): + """Load YoloV7 from a weightfile created by the source YoloV7 repository.""" + # Load PyTorch model from disk + yolov7_model = _load_yolov7_source_model_from_weights(weights_name) + + yolov7_model.profile = False + + # When traced = True, the model will skip the "Detect" step, + # which allows us to override it with an exportable version. + yolov7_model.traced = True + + # Generate replacement detector that can be traced + detector_head_state_dict = yolov7_model.model[-1].state_dict() + detector_head_state_dict["stride"] = yolov7_model.model[-1].stride + detector_head_state_dict["f"] = yolov7_model.model[ + -1 + ].f # Previous (input) node indices in sequential model + detector_head_state_dict["i"] = yolov7_model.model[ + -1 + ].i # Index in sequential model + yolov7_detect = _YoloV7Detector.from_yolov7_state_dict(detector_head_state_dict) + + return cls( + yolov7_model, + yolov7_detect, + ) + + def forward(self, image: torch.Tensor): + """ + Run YoloV7 on `image`, and produce a predicted set of bounding boxes and associated class probabilities. + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: BGR + + Returns: + boxes: Shape [batch, num preds, 4] where 4 == (center_x, center_y, w, h) + class scores multiplied by confidence: Shape [batch, num_preds, # of classes (typically 80)] + """ + feature_extraction_output = ( + *self.yolov7_feature_extractor(image), + ) # Convert output list to Tuple, for exportability + prediction = self.yolov7_detector(feature_extraction_output) + return detect_postprocess(prediction) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). 
This can be + used to submit profiling job on Qualcomm AI Hub. + """ + return {"image": ((batch_size, num_channels, height, width), "float32")} + + def sample_inputs(self, input_spec: InputSpec | None = None) -> InputsType: + if input_spec is not None and input_spec != YoloV7.get_input_spec(): + raise ValueError("Sample input has a fixed size that cannot be changed") + + return yolo_sample_inputs() + + +class _YoloV7Detector(torch.nn.Module): # YoloV7 Detection + """Converts features extracted by YoloV7 to predicted bounding boxes & associated class predictions.""" + + def __init__( + self, + stride: torch.Tensor, + f, + i, + num_anchors: int, + num_layers: int, + m_in_channels: List[int], + m_out_channel, + ): + super(_YoloV7Detector, self).__init__() + self.f = f + self.i = i + self.stride = stride + self.na = num_anchors + self.no = m_out_channel // self.na # number of outputs per anchor + self.nc = self.no - 5 # number of classes + self.nl = num_layers + for i in range(0, self.nl): + self.register_buffer( + f"anchor_grid_{i}", torch.zeros(1, self.na, 1, 1, 2) + ) # nl * [ tensor(shape(1,na,1,1,2)) ] + self.m = torch.nn.ModuleList( + torch.nn.Conv2d(m_in_channel, m_out_channel, 1) + for m_in_channel in m_in_channels + ) # output conv + + @staticmethod + def from_yolov7_state_dict( + state_dict: Mapping[str, Any], + strict: bool = True, + ): + """ + Load this module from a state dict taken from the "Detect" module. + This module is found in the original YoloV7 source repository (models/common.py::Detect). + """ + new_state_dict = {} + + # Convert anchor grid buffer from rank 6 to several rank 5 tensors, for export-friendliness. 
+ anchor_grid = state_dict["anchor_grid"] + nl = len(anchor_grid) + na = anchor_grid.shape[2] + for i in range(0, nl): + new_state_dict[f"anchor_grid_{i}"] = anchor_grid[i] + + # Copy over `m` layers + m_in_channels = [] + m_out_channel = 0 + for i in range(0, nl): + weight = f"m.{i}.weight" + for x in [weight, f"m.{i}.bias"]: + new_state_dict[x] = state_dict[x] + m_in_channels.append(new_state_dict[weight].shape[1]) + m_out_channel = new_state_dict[weight].shape[0] + + out = _YoloV7Detector( + state_dict["stride"], + state_dict["f"], + state_dict["i"], + na, + nl, + m_in_channels, + m_out_channel, + ) + out.load_state_dict(new_state_dict, strict) + return out + + def make_grid_points(self, x, i): + x = x.sigmoid() + bs, _, ny, nx = x.shape # x(bs,255,20,20) to x(bs,3,20,20,85) + x = x.view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous() + grid = self._make_grid(nx, ny) + y = x + xy = (y[..., 0:2] * 2.0 - 0.5 + grid) * self.stride[i] + wh = (y[..., 2:4] * 2) ** 2 * self.__getattr__(f"anchor_grid_{i}") + + cat = torch.cat((xy, wh, y[..., 4:]), -1) + return cat.view(bs, -1, self.no) + + def forward(self, all_x: tuple[torch.Tensor, ...]): + """ + From the outputs of the feature extraction layers of YoloV7, predict bounding boxes, + classes, and confidence. + + Parameters: + all_x: tuple[torch.Tensor] + Outputs of the feature extraction layers of YoloV7. Typically 3 5D tensors. 
+ + Returns: + pred: [batch_size, # of predictions, 5 + # of classes] + Where the rightmost dim contains [center_x, center_y, w, h, confidence score, n per-class scores] + """ + z = [] # inference output + for i in range(self.nl): + x = all_x[i] + x = self.m[i](x) # conv + points = self.make_grid_points(x, i) + z.append(points) + + return torch.cat(z, 1) + + @staticmethod + def _make_grid(nx=20, ny=20): + yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)], indexing="ij") + return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float() + + +def _load_yolov7_source_model_from_weights(weights_name: str) -> torch.nn.Module: + # Load YoloV7 model from the source repository using the given weights. + # Returns .models.yolo.Model + with SourceAsRoot( + YOLOV7_SOURCE_REPOSITORY, + YOLOV7_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ): + # necessary imports. `models` and `utils` come from the yolov7 repo. + from models.experimental import attempt_load + from models.yolo import Model + + yolov7_model = attempt_load(weights_name, map_location="cpu") # load FP32 model + + assert isinstance(yolov7_model, Model) + return yolov7_model diff --git a/qai_hub_models/models/yolov7/perf.yaml b/qai_hub_models/models/yolov7/perf.yaml new file mode 100644 index 00000000..36dfb9a3 --- /dev/null +++ b/qai_hub_models/models/yolov7/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Yolo-v7 + performance_metrics: + 
- torchscript_onnx_tflite: + inference_time: 22349.0 + throughput: 44.74473130788849 + estimated_peak_memory_range: + min: 9764864 + max: 12574848 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 21 + total_layers: 307 + job_id: jvgddqzlg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:10:34.471023Z' diff --git a/qai_hub_models/models/yolov7/requirements.txt b/qai_hub_models/models/yolov7/requirements.txt new file mode 100644 index 00000000..8e95168e --- /dev/null +++ b/qai_hub_models/models/yolov7/requirements.txt @@ -0,0 +1,6 @@ +matplotlib +opencv-python +PyYAML +requests +scipy +seaborn diff --git a/qai_hub_models/models/yolov7/test.py b/qai_hub_models/models/yolov7/test.py new file mode 100644 index 00000000..d4e2d47b --- /dev/null +++ b/qai_hub_models/models/yolov7/test.py @@ -0,0 +1,52 @@ +import numpy as np +import torch + +from qai_hub_models.models._shared.yolo.utils import detect_postprocess +from qai_hub_models.models.yolov7.app import YoloV7DetectionApp +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS +from qai_hub_models.models.yolov7.demo import main as demo_main +from qai_hub_models.models.yolov7.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + YoloV7, + _load_yolov7_source_model_from_weights, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.image_processing import preprocess_PIL_image +from qai_hub_models.utils.testing import skip_clone_repo_check + 
+OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "yolov7_demo_640_output.png" +) +WEIGHTS = "yolov7-tiny.pt" + + +@skip_clone_repo_check +def test_task(): + """Verify that raw (numeric) outputs of both (QAIHM and non-qaihm) networks are the same.""" + processed_sample_image = preprocess_PIL_image(load_image(IMAGE_ADDRESS)) + source_model = _load_yolov7_source_model_from_weights(WEIGHTS) + qaihm_model = YoloV7.from_pretrained(WEIGHTS) + + with torch.no_grad(): + # original model output + source_model.model[-1].training = False + source_model.model[-1].export = False + source_detect_out = source_model(processed_sample_image)[0] + source_out_postprocessed = detect_postprocess(source_detect_out) + + # Qualcomm AI Hub Model output + qaihm_out_postprocessed = qaihm_model(processed_sample_image) + for i in range(0, len(source_out_postprocessed)): + assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) + + +def test_yolov7_app(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS).convert("RGB") + app = YoloV7DetectionApp(YoloV7.from_pretrained(WEIGHTS)) + assert np.allclose(app.predict_boxes_from_image(image)[0], np.asarray(output_image)) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov8_det/README.md b/qai_hub_models/models/yolov8_det/README.md new file mode 100644 index 00000000..d2bb0e5c --- /dev/null +++ b/qai_hub_models/models/yolov8_det/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-v8-Detection: Real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov8_det) + +YoloV8 is a machine learning model that predicts bounding boxes and classes of objects in an image. 
+ +This is based on the implementation of Yolo-v8-Detection found +[here](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolov8_det). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolov8_det]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolov8_det.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.yolov8_det.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Yolo-v8-Detection can be found + [here](https://github.com/ultralytics/ultralytics/blob/main/LICENSE). 
+ + +## References +* [Real-Time Flying Object Detection with YOLOv8](https://arxiv.org/abs/2305.09972) +* [Source Model Implementation](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect) diff --git a/qai_hub_models/models/yolov8_det/__init__.py b/qai_hub_models/models/yolov8_det/__init__.py new file mode 100644 index 00000000..34e81a7a --- /dev/null +++ b/qai_hub_models/models/yolov8_det/__init__.py @@ -0,0 +1,3 @@ +from .app import YoloV8DetectionApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import YoloV8Detector as Model # noqa: F401 diff --git a/qai_hub_models/models/yolov8_det/app.py b/qai_hub_models/models/yolov8_det/app.py new file mode 100644 index 00000000..57016de7 --- /dev/null +++ b/qai_hub_models/models/yolov8_det/app.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import torch + +from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp + + +class YoloV8DetectionApp(YoloObjectDetectionApp): + def check_image_size(self, pixel_values: torch.Tensor) -> None: + """ + YoloV8 does not check for spatial dim shapes for input image + """ + pass diff --git a/qai_hub_models/models/yolov8_det/demo.py b/qai_hub_models/models/yolov8_det/demo.py new file mode 100644 index 00000000..dba0091b --- /dev/null +++ b/qai_hub_models/models/yolov8_det/demo.py @@ -0,0 +1,25 @@ +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolov8_det.app import YoloV8DetectionApp +from qai_hub_models.models.yolov8_det.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + YoloV8Detector, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/input_image.jpg" +) + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloV8Detector, + YoloV8DetectionApp, + IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + 
main() diff --git a/qai_hub_models/models/yolov8_det/export.py b/qai_hub_models/models/yolov8_det/export.py new file mode 100644 index 00000000..b0a64ca5 --- /dev/null +++ b/qai_hub_models/models/yolov8_det/export.py @@ -0,0 +1,189 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.yolov8_det import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. 
+ + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "yolov8_det" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolov8_det", + "Yolo-v8-Detection", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace( + model, make_torch_inputs(input_spec), check_trace=False + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov8_det/info.yaml b/qai_hub_models/models/yolov8_det/info.yaml new file mode 100644 index 00000000..9e509e23 --- /dev/null +++ b/qai_hub_models/models/yolov8_det/info.yaml @@ -0,0 +1,35 @@ +name: Yolo-v8-Detection +# id must match with the model dir name in qai_hub_models +id: yolov8_det +status: public +headline: Real-time object detection optimized for mobile and edge. +domain: Computer Vision +use_case: Object Detection +description: YoloV8 is a machine learning model that predicts bounding boxes and classes of objects in an image. 
+tags: + - real-time +research_paper: https://arxiv.org/abs/2305.09972 +research_paper_title: Real-Time Flying Object Detection with YOLOv8 +license: https://github.com/ultralytics/ultralytics/blob/main/LICENSE +source_repo: https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect +technical_details: + Number of parameters: 3.2M + Model size: 7 MB + Model checkpoint: YoloV8-N + Input resolution: 640x640 +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - 'yolov6' + - 'yolov7' +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: agpl-3.0 +dataset: [] diff --git a/qai_hub_models/models/yolov8_det/model.py b/qai_hub_models/models/yolov8_det/model.py new file mode 100644 index 00000000..5ead67a2 --- /dev/null +++ b/qai_hub_models/models/yolov8_det/model.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import torch +import torch.nn as nn +from ultralytics import YOLO as ultralytics_YOLO + +from qai_hub_models.models._shared.yolo.utils import ( + get_most_likely_score, + transform_box_layout_xywh2xyxy, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ASSET_VERSION = 1 +MODEL_ID = __name__.split(".")[-2] + +SUPPORTED_WEIGHTS = [ + "yolov8n.pt", + "yolov8s.pt", + "yolov8m.pt", + "yolov8l.pt", + "yolov8x.pt", +] +DEFAULT_WEIGHTS = "yolov8n.pt" + + +class YoloV8Detector(BaseModel): + """Exportable YoloV8 bounding box detector, end-to-end.""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + model = ultralytics_YOLO(ckpt_name).model + model.eval() + return cls(model) + + def forward(self, image: torch.Tensor): + """ + Run YoloV8 on `image`, and produce a predicted set of bounding boxes and associated class probabilities. 
+ + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + boxes: Shape [batch, num preds, 4] where 4 == (center_x, center_y, w, h) + class scores multiplied by confidence: Shape [batch, num_preds, # of classes (typically 80)] + """ + predictions, *_ = self.model(image) + boxes, scores, classes = yolov8_detect_postprocess(predictions) + return boxes, scores, classes + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. + """ + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def yolov8_detect_postprocess(detector_output: torch.Tensor): + """ + Post processing to break YoloV8 detector output into multiple, consumable tensors (eg. for NMS). + such as bounding boxes, scores and classes. + + Parameters: + detector_output: torch.Tensor + The output of Yolo Detection model + Shape is [batch, k, num_preds] + where, k = # of classes + 4 + k is structured as follows [boxes (4) : # of classes] + and boxes are co-ordinates [x_center, y_center, w, h] + + Returns: + boxes: torch.Tensor + Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2) + scores: torch.Tensor + class scores multiplied by confidence: Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. + """ + # Break output into parts + detector_output = torch.permute(detector_output, [0, 2, 1]) + boxes = detector_output[:, :, :4] + scores = detector_output[:, :, 4:] + + # Convert boxes to (x1, y1, x2, y2) + boxes = transform_box_layout_xywh2xyxy(boxes) + + # Get class ID of most likely score. 
+ scores, class_idx = get_most_likely_score(scores) + + return boxes, scores, class_idx diff --git a/qai_hub_models/models/yolov8_det/perf.yaml b/qai_hub_models/models/yolov8_det/perf.yaml new file mode 100644 index 00000000..bbaddd57 --- /dev/null +++ b/qai_hub_models/models/yolov8_det/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Yolo-v8-Detection + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 9251.0 + throughput: 108.09642200843152 + estimated_peak_memory_range: + min: 233472 + max: 2649168 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 300 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 300 + job_id: j7gjr2q8p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7043.0 + throughput: 141.9849495953429 + estimated_peak_memory_range: + min: 4939776 + max: 19565584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 294 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 294 + job_id: jlpe7wy05 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:08:50.678067Z' diff --git a/qai_hub_models/models/yolov8_det/requirements.txt b/qai_hub_models/models/yolov8_det/requirements.txt new file mode 100644 index 00000000..5d6e5cf5 --- /dev/null +++ 
b/qai_hub_models/models/yolov8_det/requirements.txt @@ -0,0 +1 @@ +ultralytics==8.0.193 diff --git a/qai_hub_models/models/yolov8_det/test.py b/qai_hub_models/models/yolov8_det/test.py new file mode 100644 index 00000000..f82b4a93 --- /dev/null +++ b/qai_hub_models/models/yolov8_det/test.py @@ -0,0 +1,50 @@ +import numpy as np +import torch +from ultralytics import YOLO as ultralytics_YOLO + +from qai_hub_models.models.yolov8_det.app import YoloV8DetectionApp +from qai_hub_models.models.yolov8_det.demo import IMAGE_ADDRESS +from qai_hub_models.models.yolov8_det.demo import main as demo_main +from qai_hub_models.models.yolov8_det.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + YoloV8Detector, + yolov8_detect_postprocess, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.image_processing import preprocess_PIL_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/output_image.png" +) +WEIGHTS = "yolov8n.pt" + + +@skip_clone_repo_check +def test_task(): + """Verify that raw (numeric) outputs of both (QAIHM and non-qaihm) networks are the same.""" + processed_sample_image = preprocess_PIL_image(load_image(IMAGE_ADDRESS)) + source_model = ultralytics_YOLO(WEIGHTS).model + qaihm_model = YoloV8Detector.from_pretrained(WEIGHTS) + + with torch.no_grad(): + # original model output + source_detect_out, *_ = source_model(processed_sample_image) + source_out_postprocessed = yolov8_detect_postprocess(source_detect_out) + + # Qualcomm AI Hub Model output + qaihm_out_postprocessed = qaihm_model(processed_sample_image) + for i in range(0, len(source_out_postprocessed)): + assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) + + +def test_yolov8_det_app(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = 
YoloV8DetectionApp(YoloV8Detector.from_pretrained(WEIGHTS)) + assert np.allclose(app.predict_boxes_from_image(image)[0], np.asarray(output_image)) + + +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov8_seg/README.md b/qai_hub_models/models/yolov8_seg/README.md new file mode 100644 index 00000000..4ccdd84f --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/README.md @@ -0,0 +1,55 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-v8-Segmentation: Real-time object segmentation optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov8_seg) + +YoloV8 is a machine learning model that predicts bounding boxes, segmentation masks and classes of objects in an image. + +This is based on the implementation of Yolo-v8-Segmentation found +[here](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/segment). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolov8_seg). + +[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolov8_seg]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolov8_seg.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../#qai-hub-models) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. 
This can be run as follows: + +```bash +python -m qai_hub_models.models.yolov8_seg.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- Code in the Qualcomm® AI Hub Models repository is covered by the LICENSE + file at the repository root. +- The license for the original implementation of Yolo-v8-Segmentation can be found + [here](https://github.com/ultralytics/ultralytics/blob/main/LICENSE). + + +## References +* [Real-Time Flying Object Detection with YOLOv8](https://arxiv.org/abs/2305.09972) +* [Source Model Implementation](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/segment) diff --git a/qai_hub_models/models/yolov8_seg/__init__.py b/qai_hub_models/models/yolov8_seg/__init__.py new file mode 100644 index 00000000..8166dd3d --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/__init__.py @@ -0,0 +1,3 @@ +from .app import YoloV8SegmentationApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import YoloV8Segmentor as Model # noqa: F401 diff --git a/qai_hub_models/models/yolov8_seg/app.py b/qai_hub_models/models/yolov8_seg/app.py new file mode 100644 index 00000000..97d73b84 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/app.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +from typing import Callable, List, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torchvision.transforms import Resize +from ultralytics.utils.ops import process_mask + +from qai_hub_models.utils.bounding_box_processing import batched_nms +from qai_hub_models.utils.draw import create_color_map +from qai_hub_models.utils.image_processing import app_to_net_image_inputs + + +class YoloV8SegmentationApp: + """ + This class consists of light-weight "app code" that is required to perform end to end inference + with YoloV8 segmentation model. 
+ + For a given image input, the app will: + * pre-process the image (convert to range[0, 1]) + * Run Yolo inference + * By default, + - post-processes output using non-maximum-suppression + - applies predicted mask on input image + """ + + def __init__( + self, + model: Callable[ + [torch.Tensor], + Tuple[ + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + List[torch.Tensor], + torch.Tensor, + ], + ], + nms_score_threshold: float = 0.45, + nms_iou_threshold: float = 0.7, + input_height: int = 640, + input_width: int = 640, + ): + """ + Initialize a YoloV8SegmentationApp application. + + Parameters: + model: torch.Tensor + YoloV8 segmentation model. + + Inputs: + Tensor of shape (N H W C x float32) with range [0, 1] and BGR channel layout. + + Outputs: + boxes: torch.Tensor + Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2) + scores: torch.Tensor + Class scores multiplied by confidence: Shape is [batch, num_preds] + masks: torch.Tensor + Predicted masks: Shape is [batch, num_preds, 32] + classes: torch.Tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. + protos: torch.Tensor + Tensor of shape[batch, 32, mask_h, mask_w] + Multiply masks and protos to generate output masks. + + nms_score_threshold + Score threshold for non maximum suppression. + + nms_iou_threshold + Intersection over Union threshold for non maximum suppression. + """ + self.model = model + self.nms_score_threshold = nms_score_threshold + self.nms_iou_threshold = nms_iou_threshold + self.input_height = input_height + self.input_width = input_width + + def check_image_size(self, pixel_values: torch.Tensor) -> None: + """ + Verify image size is valid model input. 
+ """ + return all([s % 32 == 0 for s in pixel_values.shape[-2:]]) + + def preprocess_input(self, pixel_values: torch.Tensor) -> torch.Tensor: + img_size = (self.input_height, self.input_width) + return Resize(img_size)(pixel_values) + + def predict(self, *args, **kwargs): + # See predict_boxes_from_image. + return self.predict_segmentation_from_image(*args, **kwargs) + + def predict_segmentation_from_image( + self, + pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], + raw_output: bool = False, + ) -> Tuple[ + List[torch.Tensor], List[torch.Tensor], List[torch.Tensor], List[torch.Tensor] + ] | List[Image.Image]: + """ + From the provided image or tensor, predict the bounding boxes & classes of objects detected within. + + Parameters: + pixel_values_or_image: torch.Tensor + PIL image + or + numpy array (N H W C x uint8) or (H W C x uint8) -- both BGR channel layout + or + pyTorch tensor (N C H W x fp32, value range is [0, 1]), BGR channel layout + + raw_output: bool + See "returns" doc section for details. + + Returns: + If raw_output is false or pixel_values_or_image is not a PIL image, returns: + pred_boxes: List[torch.Tensor] + List of predicted boxes for all the batches. + Each pred_box is of shape [num_boxes, 4] + pred_scores: List[torch.Tensor] + List of scores for each predicted box for all the batches. + Each pred_score is of shape [num_boxes] + pred_masks: List[torch.Tensor] + List of predicted masks for all the batches. + Each pred_mask is of shape [num_boxes, 32] + pred_classes: List[torch.Tensor] + List of predicted class for all the batches. 
+ Each pred_class is of shape [num_boxes] + + Otherwise, returns: + image_with_masks: List[PIL.Image] + Input image with predicted masks applied + """ + + # Input Prep + NHWC_int_numpy_frames, NCHW_fp32_torch_frames = app_to_net_image_inputs( + pixel_values_or_image + ) + + # Cache input spatial dimension to use for post-processing + input_h, input_w = NCHW_fp32_torch_frames.shape[2:] + NCHW_fp32_torch_frames = self.preprocess_input(NCHW_fp32_torch_frames) + + self.check_image_size(NCHW_fp32_torch_frames) + + # Run prediction + pred_boxes, pred_scores, pred_masks, pred_class_idx, proto = self.model( + NCHW_fp32_torch_frames + ) + + # Non Maximum Suppression on each batch + pred_boxes, pred_scores, pred_class_idx, pred_masks = batched_nms( + self.nms_iou_threshold, + self.nms_score_threshold, + pred_boxes, + pred_scores, + pred_class_idx, + pred_masks, + ) + + # Process mask and upsample to input shape + for batch_idx in range(len(pred_masks)): + pred_masks[batch_idx] = process_mask( + proto[batch_idx], + pred_masks[batch_idx], + pred_boxes[batch_idx], + (self.input_height, self.input_width), + upsample=True, + ).numpy() + + # Resize masks to match with input image shape + pred_masks = F.interpolate( + input=torch.Tensor(pred_masks), + size=(input_h, input_w), + mode="bilinear", + align_corners=False, + ) + + # Return raw output if requested + if raw_output or isinstance(pixel_values_or_image, torch.Tensor): + return (pred_boxes, pred_scores, pred_masks, pred_class_idx) + + # Create color map and convert segmentation mask to RGB image + pred_mask_img = torch.argmax(pred_masks, 1) + + # Overlay the segmentation masks on the image. 
+ color_map = create_color_map(pred_mask_img.max().item() + 1) + out = [] + for i, img_tensor in enumerate(NHWC_int_numpy_frames): + out.append( + Image.blend( + Image.fromarray(img_tensor), + Image.fromarray(color_map[pred_mask_img[i]]), + alpha=0.5, + ) + ) + return out diff --git a/qai_hub_models/models/yolov8_seg/demo.py b/qai_hub_models/models/yolov8_seg/demo.py new file mode 100644 index 00000000..6e289795 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/demo.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from typing import Type + +from qai_hub_models.models.yolov8_seg.app import YoloV8SegmentationApp +from qai_hub_models.models.yolov8_seg.model import ( + DEFAULT_WEIGHTS, + MODEL_ASSET_VERSION, + MODEL_ID, + YoloV8Segmentor, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime +from qai_hub_models.utils.display import display_or_save_image + +WEIGHTS_HELP_MSG = f"YoloV8-Segment checkpoint name. Valid checkpoints can be found in qai_hub_models/{MODEL_ID}/model.py" + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/bus.jpg" +) +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/out_bus_with_mask.png" +) + + +def yolov8_seg_demo( + model_type: Type[BaseModel], + default_weights: str, + weights_help_msg: str, + default_image: CachedWebModelAsset, + stride_multiple: int | None = None, + is_test: bool = False, +): + # Demo parameters + parser = get_model_cli_parser(model_type) + parser = get_on_device_demo_parser( + parser, available_target_runtimes=[TargetRuntime.TFLITE], add_output_dir=True + ) + image_help = "image file path or URL." 
+ if stride_multiple: + image_help = f"{image_help} Image spatial dimensions (x and y) must be multiples of {stride_multiple}." + + parser.add_argument( + "--image", + type=str, + help="Test image file path or URL", + ) + parser.add_argument( + "--score-threshold", + type=float, + default=0.45, + help="Score threshold for NonMaximumSuppression", + ) + parser.add_argument( + "--iou-threshold", + type=float, + default=0.7, + help="Intersection over Union (IoU) threshold for NonMaximumSuppression", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, model_type.get_model_id()) + + if args.image is None: + image_path = default_image.fetch() + else: + image_path = args.image + + # Load image & model + model = demo_model_from_cli_args(model_type, args, check_trace=False) + app = YoloV8SegmentationApp(model, args.score_threshold, args.iou_threshold) + + print("Model Loaded") + + image = load_image(image_path) + image_annotated = app.predict_segmentation_from_image(image)[0] + + if not is_test: + display_or_save_image(image_annotated, args.output_dir) + + +def main(is_test: bool = False): + yolov8_seg_demo( + YoloV8Segmentor, + DEFAULT_WEIGHTS, + WEIGHTS_HELP_MSG, + IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov8_seg/export.py b/qai_hub_models/models/yolov8_seg/export.py new file mode 100644 index 00000000..bb5ae278 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/export.py @@ -0,0 +1,189 @@ +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import List, Optional, Tuple + +import qai_hub as hub +import torch + +from qai_hub_models.models.yolov8_seg import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, + parse_target_runtime, +) +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + dst_runtime: str = "TFLITE", + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. 
+ skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + dst_runtime: Which on-device runtime to target. Default is TensorFlowLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "yolov8_seg" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + target_runtime = parse_target_runtime(dst_runtime) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolov8_seg", + "Yolo-v8-Segmentation", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace( + model, make_torch_inputs(input_spec), check_trace=False + ) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image" + ) + print(f"Optimizing model {model_name} to run on-device.") + compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + + # 3. Profile the model asset on real devices + profile_job = None + if not skip_profiling: + print(f"Profiling model {model_name} on a hosted device.") + profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 4. Run inference on-device with sample inputs + inference_job = None + if not skip_inferencing: + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options, + ) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model = compile_job.get_target_model() + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job.wait().success + profile_data = profile_job.download_profile() + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job.wait().success + inference_result = inference_job.download_output_data() + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[3] + ) + + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolov8_seg/info.yaml b/qai_hub_models/models/yolov8_seg/info.yaml new file mode 100644 index 00000000..fa09b9fe --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/info.yaml @@ -0,0 +1,39 @@ +name: Yolo-v8-Segmentation +# id must match with the model dir name in qai_hub_models +id: yolov8_seg +status: public +headline: Real-time object segmentation optimized for mobile and edge. +domain: Computer Vision +use_case: Semantic Segmentation +description: YoloV8 is a machine learning model that predicts bounding boxes, segmentation masks and classes of objects in an image. 
+tags: + - real-time +research_paper: https://arxiv.org/abs/2305.09972 +research_paper_title: Real-Time Flying Object Detection with YOLOv8 +license: https://github.com/ultralytics/ultralytics/blob/main/LICENSE +source_repo: https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/segment +technical_details: + Number of parameters: 3.4M + Model size: 13.4 MB + Model checkpoint: YoloV8N-Seg + Input resolution: 640x640 +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - unet_segmentation + - sam + - fastsam_x + - mediapipe_selfie + - yolov8_det + - ddrnet23_slim +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: agpl-3.0 +dataset: [] diff --git a/qai_hub_models/models/yolov8_seg/model.py b/qai_hub_models/models/yolov8_seg/model.py new file mode 100644 index 00000000..8b9a17da --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/model.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +import torch +import torch.nn as nn +from ultralytics import YOLO as ultralytics_YOLO + +from qai_hub_models.models._shared.yolo.utils import ( + get_most_likely_score, + transform_box_layout_xywh2xyxy, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ASSET_VERSION = 1 +MODEL_ID = __name__.split(".")[-2] + +SUPPORTED_WEIGHTS = [ + "yolov8n-seg.pt", + "yolov8s-seg.pt", + "yolov8m-seg.pt", + "yolov8l-seg.pt", + "yolov8x-seg.pt", +] +DEFAULT_WEIGHTS = "yolov8n-seg.pt" + + +class YoloV8Segmentor(BaseModel): + """Exportable YoloV8 segmentor, end-to-end.""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + if ckpt_name not in SUPPORTED_WEIGHTS: + raise ValueError( + f"Unsupported checkpoint name provided {ckpt_name}.\n" + f"Supported checkpoints are 
{list(SUPPORTED_WEIGHTS)}." + ) + model = ultralytics_YOLO(ckpt_name).model + model.eval() + return cls(model) + + def forward(self, image: torch.Tensor): + """ + Run YoloV8 on `image`, and produce a predicted set of bounding boxes and associated class probabilities. + + Parameters: + image: Pixel values pre-processed for encoder consumption. + Range: float[0, 1] + 3-channel Color Space: RGB + + Returns: + boxes: torch.Tensor + Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2) + scores: torch.Tensor + Class scores multiplied by confidence: Shape is [batch, num_preds] + masks: torch.Tensor + Predicted masks: Shape is [batch, num_preds, 32] + classes: torch.Tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. + protos: torch.Tensor + Tensor of shape[batch, 32, mask_h, mask_w] + Multiply masks and protos to generate output masks. + """ + predictions = self.model(image) + boxes, scores, masks, classes = yolov8_segment_postprocess(predictions[0]) + return boxes, scores, masks, classes, predictions[1][-1] + + def get_input_spec( + self, + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. + """ + return {"image": ((batch_size, num_channels, height, width), "float32")} + + +def yolov8_segment_postprocess(detector_output: torch.Tensor): + """ + Post processing to break YoloV8 detector output into multiple, consumable tensors (eg. for NMS). + such as bounding boxes, scores and classes. + + Parameters: + detector_output: torch.Tensor + The output of Yolo Detection model + Shape is [batch, k, num_preds] + where, k = # of classes + 4 + k is structured as follows [boxes (4) : # of classes] + and boxes are co-ordinates [x_center, y_center, w, h] + + Returns: + boxes: torch.Tensor + Bounding box locations. 
Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2) + scores: torch.Tensor + Class scores multiplied by confidence: Shape is [batch, num_preds] + masks: torch.Tensor + Predicted masks: Shape is [batch, num_preds, 32] + class_idx: torch.Tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. + """ + # Break output into parts + detector_output = torch.permute(detector_output, [0, 2, 1]) + boxes_idx, num_classes = 4, 80 + masks_dim = detector_output.shape[-1] - boxes_idx - num_classes + boxes = detector_output[:, :, :4] + scores = detector_output[:, :, 4 : boxes_idx + num_classes] + masks = detector_output[:, :, -masks_dim:] + + # Convert boxes to (x1, y1, x2, y2) + boxes = transform_box_layout_xywh2xyxy(boxes) + + # Get class ID of most likely score. + scores, class_idx = get_most_likely_score(scores) + + return boxes, scores, masks, class_idx diff --git a/qai_hub_models/models/yolov8_seg/perf.yaml b/qai_hub_models/models/yolov8_seg/perf.yaml new file mode 100644 index 00000000..6e25b242 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/perf.yaml @@ -0,0 +1,67 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 +models: +- name: Yolo-v8-Segmentation + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 10686.0 + throughput: 93.58038555118847 + estimated_peak_memory_range: + min: 4616192 + max: 6819472 + primary_compute_unit: NPU + precision: fp16 + layer_info: + 
layers_on_npu: 337 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 337 + job_id: jz57el6qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-02-21T16:36:07.212007Z' diff --git a/qai_hub_models/models/yolov8_seg/requirements.txt b/qai_hub_models/models/yolov8_seg/requirements.txt new file mode 100644 index 00000000..5d6e5cf5 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/requirements.txt @@ -0,0 +1 @@ +ultralytics==8.0.193 diff --git a/qai_hub_models/models/yolov8_seg/test.py b/qai_hub_models/models/yolov8_seg/test.py new file mode 100644 index 00000000..a111db59 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/test.py @@ -0,0 +1,61 @@ +import numpy as np +import torch +from ultralytics import YOLO as ultralytics_YOLO + +from qai_hub_models.models.yolov8_seg.app import YoloV8SegmentationApp +from qai_hub_models.models.yolov8_seg.demo import IMAGE_ADDRESS, OUTPUT_IMAGE_ADDRESS +from qai_hub_models.models.yolov8_seg.demo import main as demo_main +from qai_hub_models.models.yolov8_seg.model import ( + YoloV8Segmentor, + yolov8_segment_postprocess, +) +from qai_hub_models.utils.asset_loaders import load_image +from qai_hub_models.utils.image_processing import preprocess_PIL_image +from qai_hub_models.utils.testing import assert_most_close + +WEIGHTS = "yolov8n-seg.pt" + + +def test_task(): + """Verify that raw (numeric) outputs of both (QAIHM and non-qaihm) networks are the same.""" + source_model = ultralytics_YOLO(WEIGHTS).model + qaihm_model = YoloV8Segmentor.from_pretrained(WEIGHTS) + qaihm_app = 
YoloV8SegmentationApp(qaihm_model) + processed_sample_image = preprocess_PIL_image(load_image(IMAGE_ADDRESS)) + processed_sample_image = qaihm_app.preprocess_input(processed_sample_image) + + with torch.no_grad(): + # original model output + source_out = source_model(processed_sample_image) + source_out_postprocessed = yolov8_segment_postprocess(source_out[0]) + source_out = [*source_out_postprocessed, source_out[1][-1]] + + # Qualcomm AI Hub Model output + qaihm_out_postprocessed = qaihm_model(processed_sample_image) + for i in range(0, len(source_out_postprocessed)): + assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) + + +def test_trace(): + net = YoloV8Segmentor.from_pretrained(WEIGHTS) + input_spec = net.get_input_spec() + trace = net.convert_to_torchscript(input_spec, check_trace=False) + + # Collect output via app for traced model + img = load_image(IMAGE_ADDRESS) + app = YoloV8SegmentationApp(trace) + out_imgs = app.predict(img) + + expected_out = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_close( + np.asarray(out_imgs[0], dtype=np.float32), + np.asarray(expected_out, dtype=np.float32), + 0.005, + rtol=0.02, + atol=1.5, + ) + + +def test_demo(): + # Run demo and verify it does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/requirements-dev.txt b/qai_hub_models/requirements-dev.txt new file mode 100644 index 00000000..00cfe6e1 --- /dev/null +++ b/qai_hub_models/requirements-dev.txt @@ -0,0 +1,19 @@ +boto3 +botocore +coverage==6.5.0 +huggingface-hub==0.20.3 +jinja2==3.0.3 +mypy==0.991 +protobuf==3.20.3 +pytest-cov==4.1.0 +pytest-xdist==3.3.1 +pyyaml==6.0.1 +ruamel-yaml +schema==0.7.5 +scikit-image>=0.21.0 +tensorflow-cpu==2.13.0; sys_platform != 'darwin' +tensorflow-macos==2.13.0; sys_platform == 'darwin' +types-PyYAML +types-pillow +types-requests +keyrings.envvars; python_version >= '3.9' # used only by CI diff --git a/qai_hub_models/requirements.txt b/qai_hub_models/requirements.txt new file mode 100644 index 
00000000..6ba28b51 --- /dev/null +++ b/qai_hub_models/requirements.txt @@ -0,0 +1,17 @@ +Pillow==10.0.1 +gdown==4.7.1 +gitpython +huggingface_hub +ipython +numpy==1.23.1 +opencv-python==4.8.1.78 +prettytable +pytest==7.4.2 +pyyaml +qai_hub +requests +requests_toolbelt +schema +torch==1.13.1 +torchvision<=0.14.1 +urllib3<2 diff --git a/qai_hub_models/test/__init__.py b/qai_hub_models/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/test/e2e/__init__.py b/qai_hub_models/test/e2e/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/test/e2e/test_aimet_compile.py b/qai_hub_models/test/e2e/test_aimet_compile.py new file mode 100644 index 00000000..3e6f7785 --- /dev/null +++ b/qai_hub_models/test/e2e/test_aimet_compile.py @@ -0,0 +1,39 @@ +import numpy as np +import pytest +import qai_hub as hub + +from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.inference import compile_zoo_model_to_hub +from qai_hub_models.utils.measurement import get_model_size_mb +from qai_hub_models.utils.testing import skip_clone_repo_check_fixture # noqa: F401 + + +@pytest.mark.parametrize( + "source_model_format,target_runtime,expected_size_mb", + [ + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, 3.4), + (SourceModelFormat.TORCHSCRIPT, TargetRuntime.TFLITE, 3.4), + (SourceModelFormat.ONNX, TargetRuntime.QNN, 3.8), + (SourceModelFormat.TORCHSCRIPT, TargetRuntime.QNN, 3.8), + ], +) +def test_compile_aimet( + source_model_format, target_runtime, expected_size_mb, skip_clone_repo_check_fixture +): + model = MobileNetV2Quantizable.from_pretrained() + + calibration_data = model.get_calibration_data(target_runtime) + + device = hub.Device("Samsung Galaxy S23") + hub_model = compile_zoo_model_to_hub( + model=model, + device=device, + source_model_format=source_model_format, + 
target_runtime=target_runtime, + calibration_data=calibration_data, + ) + + # Make sure model is quantized + tgt_model_size_mb = get_model_size_mb(hub_model.model) + np.testing.assert_allclose(expected_size_mb, tgt_model_size_mb, rtol=0.1) diff --git a/qai_hub_models/test/test_async_compile_jobs.py b/qai_hub_models/test/test_async_compile_jobs.py new file mode 100644 index 00000000..efa791e2 --- /dev/null +++ b/qai_hub_models/test/test_async_compile_jobs.py @@ -0,0 +1,23 @@ +import os + +import qai_hub as hub +import yaml + + +def test_compile_jobs_success(): + """ + When testing compilation in CI, synchronously waiting for each compile_job to + finish is too slow. Instead, job ids are written to a file upon submission, + and success is validated all at once in the end using this test. + """ + if os.stat(os.environ["COMPILE_JOBS_FILE"]).st_size == 0: + return + with open(os.environ["COMPILE_JOBS_FILE"], "r") as f: + job_ids = yaml.safe_load(f.read()) + failed_jobs = {} + for name, job_id in job_ids.items(): + result = hub.get_job(job_id).wait() + if not result.success: + failed_jobs[name] = job_id + if failed_jobs: + raise ValueError(f"The following jobs failed to compile: {failed_jobs}") diff --git a/qai_hub_models/test/test_utils/__init__.py b/qai_hub_models/test/test_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qai_hub_models/test/test_utils/perf.yaml b/qai_hub_models/test/test_utils/perf.yaml new file mode 100644 index 00000000..c68d3abc --- /dev/null +++ b/qai_hub_models/test/test_utils/perf.yaml @@ -0,0 +1,59 @@ +models: +- name: AOTGAN + performance_metrics: + - reference_device_info: + name: Samsung Galaxy S23 Ultra + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-01-26T00:36:03.230526Z' + torchscript_onnx_tflite: + inference_time: 171647.0 + throughput: 5.8259101528136235 + estimated_peak_memory_range: + min: 3248128 + max: 6077152 + layer_info: + 
layers_on_npu: 243 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 243 + precision: fp16 + primary_compute_unit: NPU + job_id: jegnojxm5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 159404.0 + throughput: 6.273368296905975 + estimated_peak_memory_range: + min: 311296 + max: 41386272 + layer_info: + layers_on_npu: 283 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 283 + precision: fp16 + primary_compute_unit: NPU + job_id: jo5mojldg + job_status: Passed +aggregated: + supported_devices: + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy Tab S8 + supported_oses: + - Android + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 888 diff --git a/qai_hub_models/test/test_utils/test_info_specs.py b/qai_hub_models/test/test_utils/test_info_specs.py new file mode 100644 index 00000000..a1e1a841 --- /dev/null +++ b/qai_hub_models/test/test_utils/test_info_specs.py @@ -0,0 +1,106 @@ +from qai_hub_models.utils.config_loaders import ( + MODEL_DOMAIN, + MODEL_IDS, + MODEL_TAG, + MODEL_USE_CASE, + QAIHMModelInfo, +) + +HF_PIPELINE_TAGS = { + "text-classification", + "token-classification", + "table-question-answering", + "question-answering", + "zero-shot-classification", + "translation", + "summarization", + "conversational", + "feature-extraction", + "text-generation", + "text2text-generation", + "fill-mask", + "sentence-similarity", + "text-to-speech", + "text-to-audio", + "automatic-speech-recognition", + "audio-to-audio", + "audio-classification", + "voice-activity-detection", + "depth-estimation", + "image-classification", + "object-detection", + "image-segmentation", + "text-to-image", + "image-to-text", + "image-to-image", + "image-to-video", + "unconditional-image-generation", + "video-classification", + 
"reinforcement-learning", + "robotics", + "tabular-classification", + "tabular-regression", + "tabular-to-text", + "table-to-text", + "multiple-choice", + "text-retrieval", + "time-series-forecasting", + "text-to-video", + "visual-question-answering", + "document-question-answering", + "zero-shot-image-classification", + "graph-ml", + "mask-generation", + "zero-shot-object-detection", + "text-to-3d", + "image-to-3d", + "other", +} + + +def test_model_usecase_to_hf_pipeline_tag(): + for use_case in MODEL_USE_CASE: + assert use_case.map_to_hf_pipeline_tag() in HF_PIPELINE_TAGS + + +def test_info_spec(): + # Guard against MODEL_IDS being empty + assert ( + len(MODEL_IDS) > 0 + ), "Something went wrong. This test found no models to validate." + + for model_id in MODEL_IDS: + try: + info_spec = QAIHMModelInfo.from_model(model_id) + except Exception as err: + assert False, f"{model_id} config validation failed: {str(err)}" + + # Verify model ID is the same as folder name + assert ( + info_spec.id == model_id + ), f"{model_id} config ID does not match the model's folder name" + + # Validate spec + valid, reason = info_spec.validate() + assert valid, f"{model_id} config validation failed: {reason}" + + +def test_qaihm_domain(): + # Test " " is handled correctly and vice-versa + assert MODEL_DOMAIN.from_string("Computer Vision") == MODEL_DOMAIN.COMPUTER_VISION + assert MODEL_DOMAIN.COMPUTER_VISION.__str__() == "Computer Vision" + + +def test_qaihm_tags(): + # Test "-" is handled correctly and vice-versa + assert MODEL_TAG.from_string("real-time") == MODEL_TAG.REAL_TIME + assert MODEL_TAG.REAL_TIME.__str__() == "real-time" + + +def test_qaihm_usecases(): + # Test " " is handled correctly and vice-versa + assert ( + MODEL_USE_CASE.from_string("Image Classification") + == MODEL_USE_CASE.IMAGE_CLASSIFICATION + ) + assert MODEL_USE_CASE.IMAGE_CLASSIFICATION.__str__() == "Image Classification" diff --git a/qai_hub_models/test/test_utils/test_perf_summary.py 
def validate_perf_summary_is_empty(perf_summary):
    """
    Assert that a performance summary has recorded nothing.

    Every bucket in `progressions` and `regressions` must be empty, and both
    `new_perf_report` and `missing_devices` must have length zero.
    Raises AssertionError on the first non-empty collection found.
    """
    # No difference captured: progressions are checked before regressions.
    for bucket_map in (perf_summary.progressions, perf_summary.regressions):
        for entries in bucket_map.values():
            assert len(entries) == 0

    # No new reports captured, and no devices missing from the updated report.
    for collection in (perf_summary.new_perf_report, perf_summary.missing_devices):
        assert len(collection) == 0
def test_perf_progression_basic():
    """
    A faster TFLITE inference time must be recorded as a progression.

    The TFLITE time drops from 10.0 to 0.5 while the QNN argument is left
    unchanged; the summary is expected to file one entry under key 10 of
    `progressions`.
    """
    shared_qnn_time = 5.123
    before = get_basic_speedup_report(
        onnx_tf_inference_time=10.0, onnx_ort_qnn_inference_time=shared_qnn_time
    )
    after = get_basic_speedup_report(
        onnx_tf_inference_time=0.5, onnx_ort_qnn_inference_time=shared_qnn_time
    )

    summary = PerformanceSummary()
    validate_perf_summary_is_empty(summary)

    # Feed both reports to the summary.
    summary.update_summary(MODEL_ID, before, after)

    # Presumably (model, runtime, speedup, new time, previous time, chipset, os).
    expected_entry = (
        MODEL_ID,
        "torchscript_onnx_tflite",
        20.0,
        0.5,
        10.0,
        CHIPSET,
        OS,
    )
    assert summary.progressions[10] == [expected_entry]
def test_empty_report():
    """
    A report whose `reference_device_info` is empty must be flagged as empty.

    The same report object is passed as both the previous and the new report;
    the summary is expected to record exactly one `(MODEL_ID,)` entry in
    `empty_perf_report`.
    """
    report = get_basic_speedup_report()
    first_metrics = report["models"][0]["performance_metrics"][0]
    first_metrics["reference_device_info"] = {}

    summary = PerformanceSummary()
    validate_perf_summary_is_empty(summary)

    # Previous and new report are deliberately the very same object.
    summary.update_summary(MODEL_ID, report, report)

    assert len(summary.empty_perf_report) == 1
    assert summary.empty_perf_report[0] == (MODEL_ID,)
def get_default_aimet_config() -> str:
    """
    Return the resolved path, as a string, of `default_config.json` located
    in the same directory as this module. The file's existence is not checked.
    """
    config_file = Path(__file__).with_name("default_config.json")
    return str(config_file.resolve())
"Relu"] + }, + { + "op_list": ["ConvTranspose", "Relu"] + }, + { + "op_list": ["Conv", "Clip"] + }, + { + "op_list": ["Add", "Relu"] + }, + { + "op_list": ["Gemm", "Relu"] + } + ], + + "model_input": + { + "is_input_quantized": "True" + }, + + "model_output": + {} +} diff --git a/qai_hub_models/utils/aimet/default_config_per_channel.json b/qai_hub_models/utils/aimet/default_config_per_channel.json new file mode 100644 index 00000000..b343a4a3 --- /dev/null +++ b/qai_hub_models/utils/aimet/default_config_per_channel.json @@ -0,0 +1,69 @@ +{ + "defaults": + { + "ops": + { + "is_output_quantized": "True", + "is_symmetric": "True" + }, + "params": + { + "is_quantized": "True", + "is_symmetric": "True" + }, + "strict_symmetric": "False", + "unsigned_symmetric": "False", + "per_channel_quantization": "True" + }, + + "params": + { + "bias": + { + "is_quantized": "True" + } + }, + + "op_type": + { + "Squeeze": + { + "is_output_quantized": "True" + }, + "Pad": + { + "is_output_quantized": "True" + }, + "Mean": + { + "is_output_quantized": "False" + } + }, + + "supergroups": + [ + { + "op_list": ["Conv", "Relu"] + }, + { + "op_list": ["Conv", "Clip"] + }, + { + "op_list": ["Conv", "BatchNormalization", "Relu"] + }, + { + "op_list": ["Add", "Relu"] + }, + { + "op_list": ["Gemm", "Relu"] + } + ], + + "model_input": + { + "is_input_quantized": "True" + }, + + "model_output": + {} +} diff --git a/qai_hub_models/utils/args.py b/qai_hub_models/utils/args.py new file mode 100644 index 00000000..8036163f --- /dev/null +++ b/qai_hub_models/utils/args.py @@ -0,0 +1,389 @@ +""" +Utility Functions for parsing input args for export and other customer facing scripts. 
def get_parser() -> argparse.ArgumentParser:
    """
    Create a bare argument parser whose help output uses
    `argparse.ArgumentDefaultsHelpFormatter`.
    """
    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=formatter)
    return parser
def get_on_device_demo_parser(
    parser: argparse.ArgumentParser | None = None,
    available_target_runtimes: List[TargetRuntime] = list(
        TargetRuntime.__members__.values()
    ),
    add_output_dir: bool = False,
):
    """
    Add the on-device demo options to an argument parser and return it.

    Parameters:
        parser: Parser to extend. A new one is created with get_parser()
            when this is None (or otherwise falsy).
        available_target_runtimes: Runtimes offered as choices for
            --target-runtime. TFLITE is the default when it is in this list;
            otherwise the first entry is used.
        add_output_dir: When True, the --output-dir option is registered too
            (between --device and --device-os).

    Returns:
        The parser with the demo arguments added.
    """
    parser = parser or get_parser()

    parser.add_argument(
        "--on-device",
        action="store_true",
        help="If set, will evalute model using a Hub inference job instead of via torch.",
    )
    parser.add_argument(
        "--hub-model-id",
        type=str,
        default=None,
        help="If running on-device, uses this model Hub model ID.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="Samsung Galaxy S23",
        help="If running on-device, use this device.",
    )
    if add_output_dir:
        add_output_dir_arg(parser)
    parser.add_argument(
        "--device-os",
        type=str,
        default="",
        help="Optionally specified together with --device",
    )
    parser.add_argument(
        "--inference-options",
        type=str,
        default="",
        help="If running on-device, use these options when submitting the inference job.",
    )

    # Prefer TFLITE whenever the caller allows it.
    if TargetRuntime.TFLITE in available_target_runtimes:
        default_runtime = TargetRuntime.TFLITE
    else:
        default_runtime = available_target_runtimes[0]
    runtime_names = [runtime.name for runtime in available_target_runtimes]
    parser.add_argument(
        "--target-runtime",
        default=default_runtime.name,
        help="The runtime to demo (if --on-device is specified). Default is TFLITE.",
        choices=runtime_names,
    )

    return parser
def get_model_cli_parser(
    cls: Type[FromPretrainedTypeVar], parser: argparse.ArgumentParser | None = None
) -> argparse.ArgumentParser:
    """
    Generate the argument parser to create this model from an argparse namespace.
    Default behavior is to assume the CLI args have the same names as from_pretrained method args.

    One `--<name>` option (underscores become dashes) is added per parameter
    of `cls.from_pretrained`:
      * The value type is the type of the parameter's default when that
        default is not None; otherwise `bool` if the annotation is bool,
        else `str`.
      * Boolean options parse their value explicitly ("true"/"false",
        "yes"/"no", "1"/"0", ...), so `--flag False` yields False.
      * A parameter without any default becomes a required option.

    Parameters:
        cls: Model class exposing `from_pretrained`.
        parser: Parser to extend; a new one is created via get_parser() if omitted.

    Returns:
        The parser with the model arguments added.
    """

    def _parse_bool(value: str) -> bool:
        # Plain `type=bool` would call bool("False"), which is True: every
        # non-empty string is truthy. Parse the text instead. The empty
        # string stays False, as it was with bool("").
        lowered = value.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    if not parser:
        parser = get_parser()

    from_pretrained_sig = inspect.signature(cls.from_pretrained)
    for name, param in from_pretrained_sig.parameters.items():
        if name == "cls":
            continue

        has_default = param.default is not inspect.Parameter.empty
        # Determining type from param.annotation is non-trivial (it can be a
        # strings like "Optional[str]" or "bool | None").
        if has_default and param.default is not None:
            type_ = type(param.default)
        elif param.annotation == "bool" or param.annotation is bool:
            type_ = bool
        else:
            type_ = str

        arg_kwargs = {
            "type": _parse_bool if type_ is bool else type_,
            "help": f"For documentation, see {cls.__name__}::from_pretrained.",
        }
        if has_default:
            arg_kwargs["default"] = param.default
        else:
            # Without a default the inspect "empty" sentinel would leak into
            # from_pretrained; require the user to supply a value instead.
            arg_kwargs["required"] = True

        parser.add_argument(f"--{name.replace('_', '-')}", **arg_kwargs)
    return parser
def model_from_cli_args(
    model_cls: Type[FromPretrainedTypeVar], cli_args: argparse.Namespace
) -> FromPretrainedTypeVar:
    """
    Instantiate `model_cls` from parsed command-line arguments.

    The namespace is converted to a dict, filtered by get_model_kwargs(), and
    the surviving entries are passed as keyword arguments to
    `model_cls.from_pretrained`.
    """
    parsed_values = vars(cli_args)
    pretrained_kwargs = get_model_kwargs(model_cls, parsed_values)
    return model_cls.from_pretrained(**pretrained_kwargs)
def get_input_spec_kwargs(
    model: "BaseModel", args_dict: Mapping[str, Any]
) -> Mapping[str, Any]:
    """
    Select from `args_dict` the entries that `model.get_input_spec` accepts.

    Keys are kept in the order of the method's signature. A parameter named
    "self" is never forwarded, and parameters absent from `args_dict` are
    skipped.
    """
    accepted = inspect.signature(model.get_input_spec).parameters
    return {
        name: args_dict[name]
        for name in accepted
        if name != "self" and name in args_dict
    }
def export_parser(
    model_cls: Type[FromPretrainedTypeVar] | Type[FromPrecompiledTypeVar],
    components: Optional[List[str]] = None,
    supports_qnn: bool = True,
    exporting_compiled_model: bool = False,
) -> argparse.ArgumentParser:
    """
    Arg parser to be used in export scripts.

    Parameters:
        model_cls: Class of the model to be exported. Used to add additional
            args for model instantiation.
        components: Some models have multiple components that need to be
            compiled separately. This represents the list of options for the user to
            select which components they want to compile.
        supports_qnn:
            Whether QNN export is supported. When False, TFLITE is the only
            choice offered for --dst-runtime.
            Default=True.
        exporting_compiled_model:
            True when exporting compiled model.
            If set, the --skip-downloading, --dst-runtime and
            --compile-options arguments are not added.
            Default = False.

    Returns:
        Arg parser object.
    """
    parser = get_parser()
    parser.add_argument(
        "--device",
        type=str,
        default=DEFAULT_EXPORT_DEVICE,
        help="Device for which to export.",
    )
    parser.add_argument(
        "--skip-profiling",
        action="store_true",
        help="If set, writes compiled model to local directory without profiling.",
    )
    parser.add_argument(
        "--skip-inferencing",
        action="store_true",
        help="If set, skips verifying on-device output vs local cpu.",
    )
    if not exporting_compiled_model:
        parser.add_argument(
            "--skip-downloading",
            action="store_true",
            help="If set, skips downloading of compiled model.",
        )
    parser.add_argument(
        "--skip-summary",
        action="store_true",
        help="If set, skips printing summary of inference and profiling.",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Directory to store generated assets (e.g. compiled model). "
        "Defaults to `/build/`.",
    )
    if not exporting_compiled_model:
        # Default runtime for compiled model is fixed for given model
        parser.add_argument(
            "--dst-runtime",
            default="TFLITE",
            help="The runtime to export for. Default is TF Lite.",
            choices=TargetRuntime._member_names_
            if supports_qnn
            else [TargetRuntime.TFLITE.name],
        )
        # No compilation for compiled models
        parser.add_argument(
            "--compile-options",
            type=str,
            default="",
            help="Additional options to pass when submitting the compile job.",
        )
    parser.add_argument(
        "--profile-options",
        type=str,
        default="",
        help="Additional options to pass when submitting the profile job.",
    )
    # --components is only offered when the caller supplies the valid choices.
    if components is not None:
        parser.add_argument(
            "--components",
            nargs="+",
            type=str,
            default=None,
            choices=components,
            help="Which components of the model to be exported.",
        )

    if issubclass(model_cls, FromPretrainedMixin):
        # Skip adding CLI from model for compiled model
        # TODO: #9408 Refactor BaseModel, BasePrecompiledModel to fetch
        # parameters from compiled model
        parser = get_model_cli_parser(model_cls, parser)

        # NOTE(review): kept under the FromPretrainedMixin check; confirm this
        # nesting against the original file layout.
        if issubclass(model_cls, BaseModel):
            parser = get_model_input_spec_parser(model_cls, parser)

    return parser
@contextmanager
def always_answer_prompts(answer):
    """
    Temporarily force the module-level `_always_answer` setting to `answer`.

    While the context is active, `_always_answer` holds `answer` (True for
    yes, False for no, None for no override). The previous value is put back
    on exit, including when the body raises.
    """
    global _always_answer
    # Swap in the new answer while remembering what to restore.
    previous_answer, _always_answer = _always_answer, answer
    try:
        yield
    finally:
        _always_answer = previous_answer
def get_local_store_model_path(
    self, model_name: str, version: VersionType, filename: str
) -> str:
    """
    Return the path of a model asset inside the local store.

    The result is `self.local_store_path` joined with the relative path
    produced by `get_relative_model_asset_path` for the same arguments.
    """
    relative_asset_path = self.get_relative_model_asset_path(
        model_name, version, filename
    )
    return os.path.join(self.local_store_path, relative_asset_path)
@staticmethod
def _replace_path_keywords(
    path: str,
    model_id: Optional[str] = None,
    dataset_id: Optional[str] = None,
    version: Optional[Union[int, str]] = None,
):
    """
    Substitute the `{model_id}`, `{dataset_id}` and `{version}` placeholders
    in `path`.

    Parameters:
        path: Template string that may contain the placeholders.
        model_id: Replacement for `{model_id}`; skipped when None or empty.
        dataset_id: Replacement for `{dataset_id}`; skipped when None or empty.
        version: Replacement for `{version}` (converted with `str`); skipped
            when None or an empty string. The integer 0 is substituted.

    Returns:
        The path with every supplied placeholder replaced. Placeholders whose
        value was not supplied are left in place.
    """
    if model_id:
        path = path.replace("{model_id}", model_id)
    if dataset_id:
        path = path.replace("{dataset_id}", dataset_id)
    # Compare against None / "" explicitly: a plain truthiness test would
    # treat version 0 as "not provided" and leave "{version}" in the path.
    if version is not None and version != "":
        path = path.replace("{version}", str(version))
    return path
def _query_yes_no(question, default="yes"):
    """
    Ask a yes/no question on the console and return the answer as a bool.

    Parameters:
        question: Text shown to the user.
        default: Answer assumed when the user submits an empty line. Must be
            "yes" (the default), "no", or None (an explicit answer is then
            required).

    Returns:
        True for "yes", False for "no". When the module-level `_always_answer`
        is not None, that value is returned without prompting.

    Raises:
        ValueError: `default` is not "yes", "no" or None.

    Sourced from https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    # A configured override wins over any interaction.
    if _always_answer is not None:
        return _always_answer

    if default is None:
        suffix = " [y/n] "
    elif default == "yes":
        suffix = " [Y/n] "
    elif default == "no":
        suffix = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # Keep asking until the reply is recognised.
    while True:
        print(question + suffix, end="")
        reply = input().lower()
        if reply == "" and default is not None:
            return answers[default]
        if reply in answers:
            return answers[reply]
        print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
def _load_file(
    file: PathType,
    loader_func: Callable[[str], Any],
    dst_folder_path: tempfile.TemporaryDirectory | str | None = None,
) -> Any:
    """
    Resolve `file` to a path on disk and return `loader_func(path)`.

    Parameters:
        file: One of
            * a str / Path whose string form starts with "http": it is passed
              to download_file() first and the downloaded copy is loaded;
            * any other str / Path: handed to `loader_func` unchanged (as str);
            * a CachedWebAsset: `loader_func` receives `str(file.fetch())`.
        loader_func: Callable that takes the path string and returns the
            loaded object.
        dst_folder_path: Folder used for downloads. A new TemporaryDirectory
            is created when this is None. Ignored for non-URL inputs.

    Returns:
        Whatever `loader_func` returns.

    Raises:
        NotImplementedError: `file` is none of the supported types.
    """
    if isinstance(file, (str, Path)):
        file = str(file)
        # Anything starting with "http" (so "https" too) is treated as a URL.
        if file.startswith("http"):
            if dst_folder_path is None:
                # Keep the TemporaryDirectory object itself (not just its
                # name) so that it stays referenced while loader_func runs.
                # NOTE(review): the directory is only referenced by this local;
                # confirm loaders do not need the file after this call returns.
                dst_folder_path = tempfile.TemporaryDirectory()
            if isinstance(dst_folder_path, tempfile.TemporaryDirectory):
                dst_folder_path_str = dst_folder_path.name
            else:
                dst_folder_path_str = dst_folder_path
            # The download keeps the last path component of the URL as its file name.
            dst_path = os.path.join(dst_folder_path_str, os.path.basename(file))
            download_file(file, dst_path)
            return loader_func(dst_path)
        else:
            return loader_func(file)
    elif isinstance(file, CachedWebAsset):
        return loader_func(str(file.fetch()))
    else:
        raise NotImplementedError()
@contextmanager
def SourceAsRoot(
    source_repo_url: str,
    source_repo_commit_hash: str,
    source_repo_name: str,
    source_repo_version: int | str,
    source_repo_patches: Optional[List[str]] = None,
    keep_sys_path: bool = False,
):
    """
    Context manager that runs code with:
    * the source repository added to the system path,
    * cwd set to the source repo's root directory.

    The repository is cloned first if needed (see `maybe_clone_git_repo`).
    `SOURCE_AS_ROOT_LOCK` is held for the whole duration of the `with` body.

    Only one of this class should be active per Python session.

    Parameters:
        source_repo_url: URL of the repository to clone.
        source_repo_commit_hash: Revision to check out after cloning.
        source_repo_name: Name passed to `maybe_clone_git_repo` as model name.
        source_repo_version: Version passed to `maybe_clone_git_repo`.
        source_repo_patches: Diff files to apply after checkout
            (`None` means no patches).
        keep_sys_path: If True, `sys.path` is NOT restored on exit, so the
            repository stays importable afterwards.

    Yields:
        Local path of the repository.
    """
    repository_path = maybe_clone_git_repo(
        source_repo_url,
        source_repo_commit_hash,
        source_repo_name,
        source_repo_version,
        patches=list(source_repo_patches or []),
    )
    SOURCE_AS_ROOT_LOCK.acquire()
    try:
        # Everything after acquire() sits inside this try so that the lock is
        # released even if reading or restoring the process state fails.
        cwd = os.getcwd()
        original_path = list(sys.path)
        try:
            # Patch path for this load only, since the model source
            # code references modules via a global scope.
            # Insert with highest priority (see #7666)
            sys.path.insert(0, repository_path)
            os.chdir(repository_path)

            yield repository_path
        finally:
            try:
                os.chdir(cwd)
            finally:
                # Restore the path even when changing back to `cwd` failed.
                if not keep_sys_path:
                    sys.path = original_path
    finally:
        SOURCE_AS_ROOT_LOCK.release()
class CachedWebAsset:
    """
    Helper class for downloading files for storage in the QAIHM asset cache.

    An asset is identified by its URL and by `local_cache_path`, the location
    of the downloaded file relative to `asset_config.local_store_path`.
    Archives can be extracted next to the downloaded file; the archive itself
    is deleted after extraction and `is_extracted` records that `path()`
    should point at the extracted folder.
    """

    # Suffixes that mark a cache path as an archive when checking whether an
    # extracted copy already exists on disk.
    _ARCHIVE_SUFFIXES = (".zip", ".tar", ".tar.gz", ".tgz")

    def __init__(
        self,
        url: str,
        local_cache_path: str,
        asset_config=ASSET_CONFIG,
        model_downloader: Callable[[str, str, int], str] | None = None,
        downloader_num_retries=4,
    ):
        """
        Parameters:
            url: Location of the asset on the web.
            local_cache_path: Where to keep the asset in the local store. If it
                has no file extension, the last component of `url` is appended
                as the file name.
            asset_config: Asset config that provides the local store root.
            model_downloader: Function `(url, dst_path, num_retries) -> path`
                used to download the asset. Defaults to `download_file`.
            downloader_num_retries: Retry count handed to the downloader.
        """
        self.url = url
        self.local_cache_path = local_cache_path
        self.asset_config: ModelZooAssetConfig = asset_config
        self._downloader: Callable = model_downloader or download_file
        self.downloader_num_retries = downloader_num_retries

        # Append file name to local path if no file name is present
        path, ext = os.path.splitext(self.local_cache_path)
        if not ext:
            file_name = self.url.rsplit("/", 1)[-1]
            self.local_cache_path = os.path.join(path, file_name)

        # Set is_extracted if already extracted on disk. This must look at the
        # final cache path (the file name may just have been appended) and at
        # the same location path() reports, i.e. below the local store root.
        self.is_extracted = (
            str(self.local_cache_path).endswith(self._ARCHIVE_SUFFIXES)
            and self.path(extracted=True).is_dir()
        )

    def __repr__(self):
        return self.url

    @staticmethod
    def from_asset_store(
        relative_store_file_path: str, num_retries=4, asset_config=ASSET_CONFIG
    ):
        """
        File from the online qaihm asset store.

        Parameters:
            relative_store_file_path: Path relative to `qai_hub_models` cache root to store this asset.
                                      (also relative to the root of the online file store)

            num_retries: Number of retries when downloading this file.

            asset_config: Asset config to use to save this file.
        """
        web_store_path = f"{asset_config.asset_url}/{relative_store_file_path}"
        return CachedWebAsset(
            web_store_path,
            relative_store_file_path,
            asset_config,
            download_file,
            num_retries,
        )

    @staticmethod
    def from_google_drive(
        gdrive_file_id: str,
        relative_store_file_path: str,
        num_retries=4,
        asset_config=ASSET_CONFIG,
    ):
        """
        File from google drive.

        Parameters:
            gdrive_file_id: Unique identifier of the file in Google Drive.
                Typically found in the URL.

            relative_store_file_path: Path relative to `qai_hub_models` cache root to store this asset.

            num_retries: Number of retries when downloading this file.

            asset_config: Asset config to use to save this file.
        """
        return CachedWebAsset(
            f"https://drive.google.com/uc?id={gdrive_file_id}",
            relative_store_file_path,
            asset_config,
            download_and_cache_google_drive,
            num_retries,
        )

    def path(self, extracted=None) -> Path:
        """
        Get the path of this asset on disk.

        By default, for archived (.zip, .tar, .etc) assets, path() will return the extracted path if the asset
        has been extracted, and the original archive file's path if it has not been extracted.

        Parameters:
            extracted: If true, return the path of the extracted asset on disk.
                       If false, return the path of the archive path on disk.
        """
        if (extracted is None and self.is_extracted) or extracted:
            # The extracted location is the cache path minus its last extension.
            file, _ = os.path.splitext(self.local_cache_path)
        else:
            file = self.local_cache_path

        return Path(self.asset_config.local_store_path) / file

    def fetch(self, force=False, extract=False) -> Path:
        """
        Fetch this file from the web if it does not exist on disk.

        Parameters:
            force: If the file exists on disk already, discard it and download it again.

            extract: Extract the asset after downloading it.

        Returns:
            Path of the asset on disk (the extracted folder if it was extracted).
        """
        path = self.path()

        if path.exists():
            if not force:
                return path
            # Delete existing asset, whether it is a file or an extracted folder
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
            self.is_extracted = False
        elif self.is_extracted:
            # Someone deleted the extracted path. Fetch it again.
            self.is_extracted = False

        # is_extracted may just have been reset, in which case the download
        # target is the archive file rather than the extracted folder.
        path = self.path()

        # Create dirs
        os.makedirs(os.path.dirname(path), exist_ok=True)

        # Downloader should return path we expect.
        p1 = self._downloader(
            self.url, self.local_cache_path, self.downloader_num_retries
        )
        assert str(p1) == str(path)

        # Extract asset if requested
        if extract:
            self.extract(force)

        return self.path()

    def extract(self, force=True) -> Path:
        """
        Extract this asset if it is compressed. Updates the path of this asset to the folder to which the zip file was extracted.

        Parameters:
            force: If the asset is already extracted, delete the extracted copy
                   and extract again. The archive is deleted after every
                   extraction, so it is downloaded again when it is missing.

        Returns:
            Path of the extracted asset.

        Raises:
            ValueError: If the file extension is not a supported archive type.
        """
        if self.is_extracted:
            if not force:
                return self.path()

            extracted_path = self.path()
            # The extracted asset is normally a folder; os.remove cannot delete one.
            if extracted_path.is_dir():
                shutil.rmtree(extracted_path)
            elif extracted_path.exists():
                os.remove(extracted_path)
            self.is_extracted = False

            if not self.path().exists():
                self.fetch()

        archive_path = self.path()
        _, ext = os.path.splitext(self.local_cache_path)
        if ext == ".zip":
            extract_zip_file(str(archive_path))
        elif ext in [".tar", ".gz", ".tgz"]:
            with tarfile.open(archive_path) as f:
                f.extractall(os.path.dirname(archive_path))
        else:
            raise ValueError(f"Unsupported compressed file type: {ext}")

        os.remove(archive_path)  # Deletes the archive file
        self.is_extracted = True  # Updates path() to return extracted path

        return self.path()
+ """ + + def __init__( + self, + url: str, + model_id: str, + model_asset_version: int | str, + filename: str, + asset_config=ASSET_CONFIG, + model_downloader: Callable[[str, str, int], str] | None = None, + downloader_num_retries=4, + ): + local_cache_path = asset_config.get_local_store_model_path( + model_id, model_asset_version, filename + ) + super().__init__( + url, + local_cache_path, + asset_config, + model_downloader, + downloader_num_retries, + ) + self.model_id = model_id + self.model_version = model_asset_version + + @staticmethod # type: ignore + def from_asset_store( + model_id: str, + model_asset_version: str | int, + filename: str, + num_retries=4, + asset_config=ASSET_CONFIG, + ): + """ + File from the online qaihm asset store. + + Parameters: + model_id: str + Model ID + + model_asset_version: str | int + Asset version for this model. + + num_retries: int + Number of retries when downloading thie file. + + asset_config: ModelZooAssetConfig + Asset config to use to save this file. + """ + web_store_path = asset_config.get_model_asset_url( + model_id, model_asset_version, filename + ) + return CachedWebModelAsset( + web_store_path, + model_id, + model_asset_version, + filename, + asset_config, + download_file, + num_retries, + ) + + @staticmethod # type: ignore + def from_google_drive( + gdrive_file_id: str, + model_id: str, + model_asset_version: str | int, + filename: str, + num_retries=4, + asset_config=ASSET_CONFIG, + ): + """ + File from google drive. + + Parameters: + gdrive_file_id: Unique identifier of the file in Google Drive. + Typically found in the URL. + + model_id: Model ID + + model_asset_version: Asset version for this model. + + filename: Filename for this asset on disk. + + num_retries: Number of retries when downloading thie file. + + asset_config: Asset config to use to save this file. 
class CachedWebDatasetAsset(CachedWebAsset):
    """
    Class representing dataset-specific files that need to be stored in the local cache once downloaded.

    These files should correspond to a single (or group) of datasets in `qai_hub_models/dataset`.
    """

    def __init__(
        self,
        url: str,
        dataset_id: str,
        dataset_version: int | str,
        filename: str,
        asset_config=ASSET_CONFIG,
        model_downloader: Callable[[str, str, int], str] | None = None,
        downloader_num_retries=4,
    ):
        """
        Parameters:
            url: Location of the file on the web.
            dataset_id: Dataset ID
            dataset_version: Asset version for this dataset.
            filename: Filename for this asset on disk.
            asset_config: Asset config to use to save this file.
            model_downloader: Function used to download the file.
            downloader_num_retries: Number of retries when downloading this file.
        """
        local_cache_path = asset_config.get_local_store_dataset_path(
            dataset_id, dataset_version, filename
        )
        super().__init__(
            url,
            local_cache_path,
            asset_config,
            model_downloader,
            downloader_num_retries,
        )
        self.dataset_id = dataset_id
        self.dataset_version = dataset_version

    @staticmethod  # type: ignore
    def from_asset_store(
        dataset_id: str,
        dataset_version: str | int,
        filename: str,
        num_retries=4,
        asset_config=ASSET_CONFIG,
    ):
        """
        File from the online qaihm asset store.

        Parameters:
            dataset_id: Dataset ID

            dataset_version: Asset version for this dataset.

            filename: Filename for this asset on disk.

            num_retries: Number of retries when downloading this file.

            asset_config: Asset config to use to save this file.
        """
        web_store_path = asset_config.get_dataset_asset_url(
            dataset_id, dataset_version, filename
        )
        # Build a dataset asset (not a model asset) so that the file is cached
        # under the dataset path computed in __init__.
        return CachedWebDatasetAsset(
            web_store_path,
            dataset_id,
            dataset_version,
            filename,
            asset_config,
            download_file,
            num_retries,
        )

    @staticmethod  # type: ignore
    def from_google_drive(
        gdrive_file_id: str,
        model_id: str,
        model_asset_version: str | int,
        filename: str,
        num_retries=4,
        asset_config=ASSET_CONFIG,
    ):
        """
        File from google drive.

        Parameters:
            gdrive_file_id: Unique identifier of the file in Google Drive.
                Typically found in the URL.

            model_id: Dataset ID (the parameter name is kept for existing callers).

            model_asset_version: Asset version for this dataset
                (the parameter name is kept for existing callers).

            filename: Filename for this asset on disk.

            num_retries: Number of retries when downloading this file.

            asset_config: Asset config to use to save this file.
        """
        return CachedWebDatasetAsset(
            f"https://drive.google.com/uc?id={gdrive_file_id}",
            model_id,
            model_asset_version,
            filename,
            asset_config,
            download_and_cache_google_drive,
            num_retries,
        )
def callback_with_retry(
    num_retries: int,
    callback: Callable,
    *args: Optional[Any],
    **kwargs: Optional[Any],
) -> Any:
    """
    Allow retries when running provided function.

    Parameters:
        num_retries: Maximum number of times `callback` is called.
            With 0 (or less) the callback is never called.
        callback: Function to run.
        *args, **kwargs: Passed to `callback` unchanged on every attempt.

    Returns:
        Whatever the first successful call of `callback` returns.

    Raises:
        RuntimeError: If no attempt succeeded. The error raised by the last
            attempt (if any) is attached as `__cause__`.
    """
    last_error = None
    for attempts_left in range(num_retries, 0, -1):
        try:
            return callback(*args, **kwargs)
        except Exception as error:
            last_error = error
            error_msg = (
                f"Error: {error.message}"  # type: ignore
                if hasattr(error, "message")
                else f"Error: {str(error)}"
            )
            print(error_msg)
            if hasattr(error, "status_code"):
                print(f"Status code: {error.status_code}")  # type: ignore
            # Waiting only makes sense if another attempt follows.
            if attempts_left > 1:
                time.sleep(10)

    # Not every callable has a __name__ (e.g. functools.partial objects).
    callback_name = getattr(callback, "__name__", repr(callback))
    raise RuntimeError(f"Unable to run function {callback_name}") from last_error
class DocstringInheritorMeta(ABCMeta):
    """
    Ensures that all subclasses retain the `forward` function's docstring.

    When a class is created and its `forward` has no docstring, the docstring
    of the closest ancestor (in method resolution order) whose `forward` is
    documented is copied onto it.
    """

    def __new__(cls, name, bases, dct):
        new_class = super().__new__(cls, name, bases, dct)

        forward = getattr(new_class, "forward", None)
        if forward is None or forward.__doc__ is not None:
            return new_class

        # Walk the ancestors instead of looking at bases[0] only: a class may
        # have no explicit bases at all, and the documented `forward` is not
        # necessarily provided by the first base.
        for parent in new_class.__mro__[1:]:
            parent_doc = getattr(getattr(parent, "forward", None), "__doc__", None)
            if parent_doc:
                forward.__doc__ = parent_doc  # type: ignore
                break
        return new_class
+ """ + pass + + +class CollectionModel(FromPretrainedMixin): + """ + Model that glues together several BaseModels + """ + + pass + + +class BaseModel( + torch.nn.Module, FromPretrainedMixin, ABC, metaclass=DocstringInheritorMeta +): + @abstractmethod + def get_input_spec(self, *args, **kwargs) -> InputSpec: + """ + Returns a map from `{input_name -> (shape, dtype)}` + specifying the shape and dtype for each input argument. + """ + pass + + @classmethod + def get_model_id(cls) -> str: + """ + Return model ID for this model. + The model ID is the same as the folder name for the model under qai_hub_models/models/... + """ + module = getmodule(cls) + if not module or not module.__file__: + raise ValueError(f"Unable to get model ID for {cls.__name__}") + + # Module path is always .../qai_hub_models/models//model.py + # Extract model ID from that path. + return os.path.basename(os.path.dirname(module.__file__)) + + def get_evaluator(self) -> BaseEvaluator: + """ + Gets default model output evaluator for this model. + """ + raise NotImplementedError("This model does not define a default evaluator.") + + def convert_to_torchscript( + self, input_spec: InputSpec | None = None, check_trace: bool = True + ) -> Any: + """ + Converts the torch module to a torchscript trace, which + is the format expected by qai hub. + + This is a default implementation that may be overriden by a subclass. + """ + if not input_spec: + input_spec = self.get_input_spec() + + return torch.jit.trace( + self, make_torch_inputs(input_spec), check_trace=check_trace + ) + + def convert_to_hub_source_model( + self, + target_runtime: TargetRuntime, + output_path: str, + input_spec: InputSpec | None = None, + check_trace: bool = True, + ) -> SourceModel: + """ + Convert to a AI Hub source model appropriate for the export method. 
+ """ + # Local import to prevent circular dependency + from qai_hub_models.utils.inference import prepare_compile_zoo_model_to_hub + + assert isinstance(self, BaseModel) + source_model, _ = prepare_compile_zoo_model_to_hub( + self, + source_model_format=self.preferred_hub_source_model_format(target_runtime), + target_runtime=target_runtime, + output_path=output_path, + input_spec=input_spec, + check_trace=check_trace, + ) + return source_model + + def get_hub_compile_options( + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + ) -> str: + """ + Convert to a AI Hub source model appropriate for the export method. + """ + compile_options = "" + if target_runtime == TargetRuntime.QNN: + compile_options = "--target_runtime qnn_lib_aarch64_android" + if other_compile_options != "": + return compile_options + " " + other_compile_options + return compile_options + + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.TORCHSCRIPT + + def sample_inputs(self, input_spec: InputSpec | None = None) -> InputsType: + """ + Returns a set of sample inputs for the model. + + For each input name in the model, a list of numpy arrays is provided. + If the returned set is batch N, all input names must contain exactly N numpy arrays. + + This is a default implementation that returns a single random data array + for each input name based on the shapes and dtypes in `get_input_spec`. + + A subclass may choose to override this and fetch a batch of real input data + from a data source. 
+ """ + if not input_spec: + input_spec = self.get_input_spec() + inputs_dict = {} + inputs_list = make_torch_inputs(input_spec) + for i, input_name in enumerate(input_spec.keys()): + inputs_dict[input_name] = [inputs_list[i].numpy()] + return inputs_dict + + +class FromPrecompiledMixin(ABC): + @classmethod + @abstractmethod + def from_precompiled( + cls: Type[FromPrecompiledTypeVar], *args, **kwargs + ) -> "FromPrecompiledTypeVar": + """ + Utility function that helps users get up and running with a default + precompiled model. While this function may take arguments, all arguments + should have default values specified, so that all classes can be invoked + with `cls.from_precompiled()` and always have it return something reasonable. + """ + pass + + +class BasePrecompiledModel(FromPrecompiledMixin): + @abstractmethod + def get_input_spec(self, *args, **kwargs) -> InputSpec: + """ + Returns a map from `{input_name -> (shape, dtype)}` + specifying the shape and dtype for each input argument. + """ + pass + + def sample_inputs(self, input_spec: InputSpec | None = None) -> InputsType: + """ + Returns a set of sample inputs for the model. + + For each input name in the model, a list of numpy arrays is provided. + If the returned set is batch N, all input names must contain exactly N numpy arrays. + + This is a default implementation that returns a single random data array + for each input name based on the shapes and dtypes in `get_input_spec`. + + A subclass may choose to override this and fetch a batch of real input data + from a data source. 
def batched_nms(
    iou_threshold: float,
    score_threshold: float,
    boxes: torch.Tensor,
    scores: torch.Tensor,
    *gather_additional_args,
) -> Tuple[List[torch.Tensor], ...]:
    """
    Non maximum suppression over several batches.

    Inputs:
        iou_threshold: float
            Intersection over union (IoU) threshold

        score_threshold: float
            Score threshold (throw away any boxes with scores under this threshold)

        boxes: torch.Tensor
            Boxes to run NMS on. Shape is [B, N, 4], B == batch, N == num boxes.
            The first 4 values of each box are handed to `torchvision.ops.nms`
            unchanged, so they must use the corner layout that function expects.

        scores: torch.Tensor
            Scores for each box. Shape is [B, N], range is [0:1]

        *gather_additional_args: torch.Tensor, ...
            Additional tensor(s) to be gathered in the same way as boxes and scores.
            In other words, each arg is returned with only the elements for the boxes selected by NMS.
            Should be shape [B, N, ...]

    Outputs:
        boxes_out: List[torch.Tensor]
            Output boxes. This is list of tensors--one tensor per batch.
            Each tensor is shape [S, 4], where S == number of selected boxes.

        scores_out: List[torch.Tensor]
            Output scores. This is list of tensors--one tensor per batch.
            Each tensor is shape [S], where S == number of selected boxes.

        *args : List[torch.Tensor], ...
            "Gathered" additional arguments, if provided.
    """
    scores_out: List[torch.Tensor] = []
    boxes_out: List[torch.Tensor] = []
    args_out: List[List[torch.Tensor]] = [[] for _ in gather_additional_args]

    for batch_idx in range(boxes.shape[0]):
        # Keep only the boxes whose score reaches the threshold
        batch_scores = scores[batch_idx]
        scores_idx = torch.nonzero(batch_scores >= score_threshold).squeeze(-1)
        batch_scores = batch_scores[scores_idx]
        batch_boxes = boxes[batch_idx, scores_idx]
        batch_args = [arg[batch_idx, scores_idx] for arg in gather_additional_args]

        # NMS is skipped when no box survived the score threshold.
        if batch_scores.shape[0] > 0:
            nms_indices = nms(batch_boxes[..., :4], batch_scores, iou_threshold)
            batch_boxes = batch_boxes[nms_indices]
            batch_scores = batch_scores[nms_indices]
            batch_args = [arg[nms_indices] for arg in batch_args]

        boxes_out.append(batch_boxes)
        scores_out.append(batch_scores)
        for arg_idx, arg in enumerate(batch_args):
            args_out[arg_idx].append(arg)

    return boxes_out, scores_out, *args_out
def compute_box_affine_crop_resize_matrix(
    box_corners: torch.Tensor, output_image_size: Tuple[int, int]
) -> List[np.ndarray]:
    """
    Computes the affine transform matrices required to crop, rescale,
    and pad the box described by box_corners to fit into an image of the given size without warping.

    Inputs:
        box_corners: torch.Tensor
            Bounding box corners. These coordinates will be mapped to the output image. Shape is [B, 3, 2],
            where B = batch,
                  3 = (top left point, bottom left point, top right point)
                  and 2 = (x, y)
            If more than 3 points are given per box, only the first 3 are used.

        output_image_size: Tuple[int, int]
            Size of image to which the box should be resized and cropped.
            Index 0 is used for the x extent and index 1 for the y extent
            of the destination points.

    Outputs:
        affines: List[np.ndarray]
            Computed affine transform matrices, one per batch entry. Shape is (2 x 3)
    """
    # Destination of the three corner points in the output image
    network_input_points = np.array(
        [[0, 0], [0, output_image_size[1] - 1], [output_image_size[0] - 1, 0]],
        dtype=np.float32,
    )

    affines = []
    for batch in range(box_corners.shape[0]):
        # Select the first three POINTS (axis 0 of one box). Slicing the last
        # axis would act on the (x, y) pair and leave extra points in place.
        # The destination points are float32, so the source points are
        # converted to the same dtype before being handed to OpenCV.
        src = box_corners[batch][:3].detach().cpu().numpy().astype(np.float32)
        affines.append(cv2.getAffineTransform(src, network_input_points))
    return affines
def box_xyxy_to_xywh(
    box_xy: torch.Tensor,
) -> torch.Tensor:
    """
    Converts box coordinates to center / width / height notation.

    The input is left untouched; the result is written into a clone of it.

    Inputs:
        box_xy: torch.Tensor
            Bounding box. Shape is [B, 2, 2],
            where B = batch,
                  2 = (point 1, point 2),
                  and 2 = (x, y)

    Outputs:
        box_cwh
            Bounding box. Shape is [B, 2, 2],
            [[xc, yc], [w, h]] * Batch
    """
    first_point = box_xy[..., 0, :2]
    second_point = box_xy[..., 1, :2]

    box_cwh = torch.clone(box_xy)
    # Row 1 holds the extent (w, h) of the box ...
    box_cwh[..., 1, :2] = second_point - first_point
    # ... and row 0 its center: first point plus half of the stored extent.
    box_cwh[..., 0, :2] = first_point + box_cwh[..., 1, :2] / 2

    return box_cwh
def capture_and_display_processed_frames(
    frame_processor: Callable[[np.ndarray], np.ndarray],
    window_display_name: str,
    cap_device: int = 0,
) -> None:
    """
    Capture frames from the given input camera device, run them through
    the frame processor, and display the outputs in a window with the given name.

    User should press Esc to exit.

    Inputs:
        frame_processor: Callable[[np.ndarray], np.ndarray]
            Processes frames.
            Input and output are numpy arrays of shape (H W C).
            The frame handed to the processor is mirrored horizontally and has
            its channel axis reversed relative to what the capture device
            returned; the channel axis of the processor's output is reversed
            again before display.
            NOTE(review): OpenCV captures are usually BGR, which would make the
            processor's input and output RGB -- confirm against the callers.
        window_display_name: str
            Name of the window used to display frames.
        cap_device: int
            Identifier for the camera to use to capture frames.

    Raises:
        ValueError: If the capture device cannot be opened.
    """
    cv2.namedWindow(window_display_name)
    capture = cv2.VideoCapture(cap_device)
    try:
        if not capture.isOpened():
            raise ValueError("Unable to open video capture.")

        has_frame, frame = capture.read()
        while has_frame:
            # mirror frame (axis 1) and reverse the channel order (axis 2)
            frame = np.ascontiguousarray(frame[:, ::-1, ::-1])

            # process & show frame
            processed_frame = frame_processor(frame)
            cv2.imshow(window_display_name, processed_frame[:, :, ::-1])

            has_frame, frame = capture.read()
            key = cv2.waitKey(1)
            if key == ESCAPE_KEY_ID:
                break
    finally:
        # Give the camera back even if the frame processor raised.
        capture.release()
def compute_psnr(
    output_a: Union[torch.Tensor, np.ndarray],
    output_b: Union[torch.Tensor, np.ndarray],
    eps: float = 1e-5,
    eps2: float = 1e-10,
) -> float:
    """
    Computes the PSNR between two tensors.

    Both inputs are flattened. The peak is the largest absolute value of
    `output_b`, and the noise is the root mean squared difference of the inputs:

        20 * log10((max|b| + eps) / (rmse(a, b) + eps2))

    Parameters:
        output_a: First tensor / array.
        output_b: Second tensor / array. Provides the peak value.
        eps: Added to the peak value.
        eps2: Added to the RMSE, so identical inputs do not divide by zero.

    Returns:
        PSNR in dB.
    """

    def _to_flat_array(output: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
        if isinstance(output, np.ndarray):
            return output.flatten()
        # .cpu() is a no-op for CPU tensors and makes other devices work too.
        return output.detach().cpu().numpy().flatten()

    a = _to_flat_array(output_a)
    b = _to_flat_array(output_b)

    max_b = np.abs(b).max()
    delta = a - b
    rmse = np.sqrt((delta * delta).sum() / b.size)

    return 20 * np.log10((max_b + eps) / (rmse + eps2))
def generate_comparison_metrics(
    expected: List[np.ndarray], actual: List[np.ndarray]
) -> Dict[int, InfenceMetrics]:
    """
    Builds per-output metrics describing how closely two sets of model outputs agree.

    Typical use: `expected` comes from a reference run (e.g. local CPU) and
    `actual` from the run under test (e.g. on device).

    Parameters:
        expected: Output arrays of the ground truth model, one per model output.
        actual: Output arrays of the experimental model, in the same order.

    Returns:
        Map from output index to its metrics (PSNR and the expected output's shape).
        Outputs are paired positionally with zip, so extra entries in the
        longer list are ignored.
    """
    return {
        index: InfenceMetrics(compute_psnr(reference, candidate), reference.shape)
        for index, (reference, candidate) in enumerate(zip(expected, actual))
    }
class FORM_FACTOR(Enum):
    """Device form factors, as named in the `form_factors` list of a model's info.yaml."""

    PHONE = 0
    TABLET = 1
    IOT = 2
    XR = 3

    @staticmethod
    def from_string(string: str) -> "FORM_FACTOR":
        # Member lookup is by upper-cased name; unknown names raise KeyError.
        member_name = string.upper()
        return FORM_FACTOR[member_name]

    def __str__(self):
        # "IoT" has its own capitalisation; every other member is title-cased.
        return "IoT" if self is FORM_FACTOR.IOT else self.name.title()
class MODEL_USE_CASE(Enum):
    """Use cases a model can declare in the `use_case` field of its info.yaml."""

    # Image: 100 - 199
    IMAGE_CLASSIFICATION = 100
    IMAGE_EDITING = 101
    IMAGE_GENERATION = 102
    SUPER_RESOLUTION = 103
    SEMANTIC_SEGMENTATION = 104
    # Ex: OCR, image caption
    IMAGE_TO_TEXT = 105
    OBJECT_DETECTION = 106
    POSE_ESTIMATION = 107

    # Audio: 200 - 299
    SPEECH_RECOGNITION = 200
    AUDIO_ENHANCEMENT = 201

    # Video: 300 - 399
    VIDEO_CLASSIFICATION = 300
    VIDEO_GENERATION = 301

    # LLM: 400 - 499
    TEXT_GENERATION = 400

    @staticmethod
    def from_string(string: str) -> "MODEL_USE_CASE":
        # "image classification" -> IMAGE_CLASSIFICATION; unknown names raise KeyError.
        member_name = string.upper().replace(" ", "_")
        return MODEL_USE_CASE[member_name]

    def __str__(self):
        # IMAGE_TO_TEXT -> "Image To Text"
        return self.name.replace("_", " ").title()

    def map_to_hf_pipeline_tag(self):
        """Map our usecase to pipeline-tag used by huggingface."""
        # Use cases whose tag is not simply the dashed, lower-cased member name.
        # Kept local: a class-level dict inside an Enum would become a member.
        special_tags = {
            "IMAGE_EDITING": "image-to-image",
            "SUPER_RESOLUTION": "image-to-image",
            "SEMANTIC_SEGMENTATION": "image-segmentation",
            "POSE_ESTIMATION": "image-classification",
            "AUDIO_ENHANCEMENT": "audio-to-audio",
            "VIDEO_GENERATION": "image-to-video",
            "IMAGE_GENERATION": "unconditional-image-generation",
            "SPEECH_RECOGNITION": "automatic-speech-recognition",
        }
        default_tag = self.name.replace("_", "-").lower()
        return special_tags.get(self.name, default_tag)
def __init__(self, perf_yaml_path, model_name):
    """
    Load perf.yaml (when it exists) and collect the per-runtime summaries.

    Parameters:
        perf_yaml_path: Path to the model's perf.yaml.
        model_name: Name of the model these numbers belong to.

    After construction:
        skip_overall: True when perf.yaml does not exist.
        skip_tflite / skip_qnn: True when perf.yaml does not exist, when some
            model has no summary for that runtime, or when any inference time
            for that runtime is a string instead of a number.
        perf_details / tflite_summary / qnn_summary: always defined; empty
            when perf.yaml does not exist.
    """
    self.model_name = model_name
    self.perf_yaml_path = perf_yaml_path
    self.skip_overall = False
    self.skip_tflite = False
    self.skip_qnn = False
    self.tflite_row = (
        "| Samsung Galaxy S23 Ultra (Android 13) | Snapdragon® 8 Gen 2 |"
    )
    self.qnn_row = "| Samsung Galaxy S23 Ultra (Android 13) | Snapdragon® 8 Gen 2 |"

    # Defined up front so the other methods never hit AttributeError on an
    # instance that was built without a perf.yaml.
    self.perf_details = {"models": []}
    self.tflite_summary = []
    self.qnn_summary = []

    if not os.path.exists(self.perf_yaml_path):
        # Nothing can be reported for any runtime.
        self.skip_overall = True
        self.skip_tflite = True
        self.skip_qnn = True
        return

    with open(self.perf_yaml_path, "r") as perf_file:
        self.perf_details = yaml.safe_load(perf_file)
    models = self.perf_details["models"]

    def _summaries_for(runtime_path):
        """Return (summaries, skip) for one runtime key of perf.yaml."""
        summaries = []
        try:
            for model in models:
                summaries.append(model["performance_metrics"][0][runtime_path])
        except (KeyError, IndexError, TypeError):
            # At least one model has no metrics for this runtime.
            return summaries, True
        # A string inference time (instead of a number) means there is no
        # usable measurement, so the runtime is skipped.
        failed = any(
            isinstance(summary["inference_time"], str) for summary in summaries
        )
        return summaries, failed

    # Get TFLite summary from perf.yaml
    self.tflite_summary, self.skip_tflite = _summaries_for(TFLITE_PATH)
    # Get QNN summary from perf.yaml
    self.qnn_summary, self.skip_qnn = _summaries_for(QNN_PATH)
def get_tflite_row(self, has_assets=True):
    """
    Get the TFLite performance-table row(s), one per submodel.

    Parameters:
        has_assets: Forwarded to get_row. Defaults to True, which matches the
            behaviour before this parameter existed; get_qnn_row already
            exposes the same switch.

    Returns:
        The row text produced by get_row for the TFLite summaries.
    """
    return self.get_row(
        self.skip_tflite,
        self.tflite_summary,
        self.tflite_row,
        "tflite",
        has_assets,
    )
def get_perf_details(
    self,
    runtime: TargetRuntime,
    device: str | None = None,
    device_os: str | None = None,
) -> Dict[str, ModelRuntimePerformanceDetails | None]:
    """
    Get model performance details for the selected device and runtime.

    Parameters:
        runtime: Runtime whose metrics should be read (TFLite or QNN).
        device: If set, only metrics recorded for this device name are used.
        device_os: If set, only metrics recorded for this device OS are used.

    Returns a dictionary of
        { model_component_name : performance details object }

    If there is only one component, model_component_name == model_name.

    The performance details object will be null if the requested
    perf details do not exist, or if the perf job failed.

    NOTE(review): when several metric entries match (e.g. device is None and
    perf.yaml lists more than one device), the entries are processed in order
    and the last one wins. The previous docstring promised "the first device";
    confirm which is intended.

    Raises:
        NotImplementedError: If the runtime is neither TFLite nor QNN.
    """
    if runtime == TargetRuntime.TFLITE:
        rt_name = "torchscript_onnx_tflite"
    elif runtime == TargetRuntime.QNN:
        rt_name = "torchscript_onnx_qnn"
    else:
        raise NotImplementedError()

    # Model -> Performance Details
    # None == Test did not run.
    perf_details: Dict[
        str, QAIHMModelPerf.ModelRuntimePerformanceDetails | None
    ] = {}

    for model in self.perf_details["models"]:
        name = model["name"]
        for device_metrics in model["performance_metrics"]:
            device_info = device_metrics["reference_device_info"]
            device_name = device_info["name"]
            metric_device_os = device_info["os"]

            # Verify Device Matches Requested Device
            if device and device_name != device:
                continue
            if device_os and metric_device_os != device_os:
                continue

            perf_rt = device_metrics.get(rt_name, None)

            # Inference Time
            inf_time = perf_rt["inference_time"] if perf_rt else "null"
            if inf_time is None or isinstance(inf_time, str):
                # Compilation or inference failed: there is no numeric
                # measurement (perf.yaml stores a placeholder such as "null").
                perf_details[name] = None
                continue
            inf_time /= 1000

            # Memory
            peak_mem = perf_rt["estimated_peak_memory_range"]
            peak_mem_bytes: Tuple[int, int] = (peak_mem["min"], peak_mem["max"])

            # Layer Info: "layers_on_npu" -> "NPU", keeping only units in use.
            compute_unit_counts = {
                layer_name[-3:].upper(): count
                for layer_name, count in perf_rt["layer_info"].items()
                if "layers_on" in layer_name and count > 0
            }

            perf_details[name] = QAIHMModelPerf.ModelRuntimePerformanceDetails(
                # The component's name, not the whole `model` mapping.
                model_name=name,
                device_name=device_name,
                device_os=metric_device_os,
                runtime=runtime,
                inference_time_ms=inf_time,
                peak_memory_bytes=peak_mem_bytes,
                compute_unit_counts=compute_unit_counts,
            )

        # No metric entry matched the requested device / OS.
        if name not in perf_details:
            perf_details[name] = None

    return perf_details
self.has_animated_banner = has_animated_banner + self.code_gen_config = code_gen_config + self.technical_details = technical_details + + def validate(self) -> Tuple[bool, Optional[str]]: + """Returns false with a reason if the info spec for this model is not valid.""" + # Validate ID + if self.id not in MODEL_IDS: + return False, f"{self.id} is not a valid QAI Hub Models ID." + if " " in self.id or "-" in self.id: + return False, "Model IDs cannot contain spaces or dashes." + if self.id.lower() != self.id: + return False, "Model IDs must be lowercase." + + # Validate (used as repo name for HF as well) + if " " in self.name: + return False, "Model Name must not have a space." + + # Headline should end with period + if not self.headline.endswith("."): + return False, "Model headlines must end with a period." + + # Quantized models must contain quantized tag + if ("quantized" in self.id) and (MODEL_TAG.QUANTIZED not in self.tags): + return False, f"Quantized models must have quantized tag. tags: {self.tags}" + if ("quantized" not in self.id) and (MODEL_TAG.QUANTIZED in self.tags): + return ( + False, + f"Models with a quantized tag must have 'quantized' in the id. tags: {self.tags}", + ) + + # Validate related models are present + for r_model in self.related_models: + if r_model not in MODEL_IDS: + return False, f"Related model {r_model} is not a valid model ID." + if r_model == self.id: + return False, f"Model {r_model} cannot be related to itself." 
+ + # If paper is arxiv, it should be an abs link + if self.research_paper.startswith("https://arxiv.org/"): + if "/abs/" not in self.research_paper: + return ( + False, + "Arxiv links should be `abs` links, not link directly to pdfs.", + ) + + # If license_type does not match the map, return an error + if self.license_type not in HF_AVAILABLE_LICENSES: + return False, f"license can be one of these: {HF_AVAILABLE_LICENSES}" + + # Web assets exist + if self.status == MODEL_STATUS.PUBLIC and not self.has_static_banner: + return False, "All public models must have a static banner." + + # Required assets exist + if self.status == MODEL_STATUS.PUBLIC: + if not os.path.exists(self.get_package_path() / "info.yaml"): + return False, "All public models must have an info.yaml" + + if self.code_gen_config.get( + "tflite_export_failure_reason", False + ) and self.code_gen_config.get("qnn_export_failure_reason", False): + return False, "Public models must support at least one export path" + + session = create_session() + if self.has_static_banner: + static_banner_url = ASSET_CONFIG.get_web_asset_url( + self.id, QAIHM_WEB_ASSET.STATIC_IMG + ) + if session.head(static_banner_url).status_code != requests.codes.ok: + return False, f"Static banner is missing at {static_banner_url}" + if self.has_animated_banner: + animated_banner_url = ASSET_CONFIG.get_web_asset_url( + self.id, QAIHM_WEB_ASSET.ANIMATED_MOV + ) + if session.head(animated_banner_url).status_code != requests.codes.ok: + return False, f"Animated banner is missing at {animated_banner_url}" + + expected_qaihm_repo = f"qai_hub_models/models/{self.id}" + if expected_qaihm_repo != ASSET_CONFIG.get_qaihm_repo(self.id): + return False, "QAIHM repo not pointing to expected relative path" + + expected_example_use = f"qai_hub_models/models/{self.id}#example--usage" + if expected_example_use != ASSET_CONFIG.get_example_use(self.id): + return False, "Example-usage field not pointing to expected relative path" + + return True, None 
def get_hugging_face_metadata(self, root: Path = QAIHM_PACKAGE_ROOT):
    """
    Build the metadata mapping for this model's Hugging Face model card.

    Always contains library_name, license, tags and pipeline_tag; "datasets"
    is added only when self.dataset is not an empty list.
    `root` is accepted for interface compatibility and is not used here.
    """
    tag_names = [tag.name.lower() for tag in self.tags]
    tag_names.append("android")

    hf_metadata: Dict[str, Union[str, List[str]]] = {
        "library_name": "pytorch",
        "license": self.license_type,
        "tags": tag_names,
    }
    if self.dataset != []:
        hf_metadata["datasets"] = self.dataset
    # Inserted last so the key order matches the card layout.
    hf_metadata["pipeline_tag"] = self.get_hf_pipeline_tag()
    return hf_metadata
@staticmethod
def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None):
    """
    Create a QAIHMModelInfo from an info.yaml and an optional code-gen.yaml.

    Parameters:
        info_path: Path to the model's info.yaml.
        code_gen_path: Path to the model's code-gen.yaml; passed through to
            load_code_gen_yaml (which also accepts None).
    """
    # Load CFG and params
    info = QAIHMModelInfo.load_info_yaml(info_path)
    code_gen = QAIHMModelInfo.load_code_gen_yaml(code_gen_path)

    # String fields are converted to their enum types; the rest pass through.
    return QAIHMModelInfo(
        name=info["name"],
        id=info["id"],
        status=MODEL_STATUS.from_string(info["status"]),
        headline=info["headline"],
        domain=MODEL_DOMAIN.from_string(info["domain"]),
        description=info["description"],
        use_case=MODEL_USE_CASE.from_string(info["use_case"]),
        tags=[MODEL_TAG.from_string(tag) for tag in info["tags"]],
        research_paper=info["research_paper"],
        research_paper_title=info["research_paper_title"],
        license=info["license"],
        source_repo=info["source_repo"],
        applicable_scenarios=info["applicable_scenarios"],
        related_models=info["related_models"],
        form_factors=[FORM_FACTOR.from_string(ff) for ff in info["form_factors"]],
        has_static_banner=info["has_static_banner"],
        has_animated_banner=info["has_animated_banner"],
        code_gen_config=code_gen,
        license_type=info["license_type"],
        dataset=info["dataset"],
        technical_details=info["technical_details"],
    )
@staticmethod
def load_info_yaml(path: str | Path):
    """
    Load an info.yaml file and validate it against INFO_YAML_SCHEMA.

    Parameters:
        path: Path to the info.yaml file.

    Returns:
        The validated data, as returned by the schema.

    Raises:
        AssertionError: If the file does not satisfy the schema.
    """
    with open(path) as f:
        data = yaml.safe_load(f)
    try:
        # Validate high level-schema
        return QAIHMModelInfo.INFO_YAML_SCHEMA.validate(data)
    except SchemaError as e:
        # Raised explicitly instead of `assert 0`: assert statements are
        # removed under `python -O`, which would let invalid data through.
        # AssertionError is kept because that is what callers have always seen.
        raise AssertionError(f"{e.code} in {path}") from e
def display_or_save_image(
    image: Image,
    output_dir: Optional[str] = None,
    filename: str = "image.png",
    desc: str = "image",
) -> bool:
    """
    Save the image when an output directory is given; otherwise try to display it.
    If displaying does not succeed, the image is saved to "<cwd>/build" instead.

    Parameters:
        image: PIL Image to save or display.
        output_dir: If set, the image is saved to this directory and not displayed.
        filename: File name used whenever the image is saved.
        desc: Description of what the image is, used in printed messages.

    Returns:
        True if the image was displayed, False if it was saved to disk.
    """
    if output_dir is None:
        if display_image(image, desc):
            return True
        # Display was not possible: fall back to the default location.
        output_dir = str(Path.cwd() / "build")

    save_image(image, output_dir, filename, desc)
    return False
def draw_box_from_corners(
    frame: numpy.ndarray, corners: numpy.ndarray | torch.Tensor, color=(0, 0, 0), size=3
):
    """
    Draw a box whose boundary is given by 4 corner points.

    Parameters:
        frame: numpy.ndarray
            numpy array (H W C x uint8, BGR)

        corners: numpy.ndarray | torch.Tensor
            array (4, 2) where layout is
                [x1, y1] [x2, y2], ...
            or
            array (8) where layout is
                x1, y1, x2, y2, ...

        color: Tuple[int, int, int]
            Color of drawn points and connection lines (BGR)

        size: int
            Size of drawn points and connection lines

    Returns:
        None; modifies frame in place.
    """
    # Corner index pairs joined by a line: 0-1, 0-2, 1-3 and 2-3
    # (the diagonals 0-3 and 1-2 are not drawn).
    box_edges = [(0, 1), (0, 2), (1, 3), (2, 3)]

    draw_points(frame, corners, color, size)
    draw_connections(frame, corners, box_edges, color, size)
+ """ + xc, yc, h, w = box + TL = [xc - w // 2, yc - h // 2] + BR = [xc + w // 2, yc + h // 2] + cv2.rectangle(frame, TL, BR, color, size) + + +def draw_box_from_xyxy( + frame: numpy.ndarray, + top_left: numpy.ndarray | torch.Tensor | Tuple[int, int], + bottom_right: numpy.ndarray | torch.Tensor | Tuple[int, int], + color: Tuple[int, int, int] = (0, 0, 0), + size: int = 3, + text: Optional[str] = None, +): + """ + Draw a box using the provided top left / bottom right points to compute the box. + + Parameters: + frame: numpy.ndarray + numpy array (H W C x uint8, BGR) + + box: numpy.ndarray | torch.Tensor + array (4), where layout is + [xc, yc, h, w] + + color: Tuple[int, int, int] + Color of drawn points and connection lines (RGB) + + size: int + Size of drawn points and connection lines BGR channel layout + + text: None | str + Overlay text at the top of the box. + + Returns: + None; modifies frame in place. + """ + if not isinstance(top_left, tuple): + top_left = (int(top_left[0].item()), int(top_left[1].item())) + if not isinstance(bottom_right, tuple): + bottom_right = (int(bottom_right[0].item()), int(bottom_right[1].item())) + cv2.rectangle(frame, top_left, bottom_right, color, size) + if text is not None: + cv2.putText( + frame, + text, + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + color, + size, + ) + + +def create_color_map(num_classes): + """ + Assign a random color to each class in the dataset to produce a segmentation mask for drawing. + + Inputs: + num_classes: Number of colors to produce. + + Returns: + A list of `num_classes` colors in RGB format. 
def fetch_huggingface_target_model(
    model_name: str,
    dst_folder: str | Path,
    runtime_path: TargetRuntime = TargetRuntime.TFLITE,
    config: ModelZooAssetConfig = ASSET_CONFIG,
) -> List[str]:
    """
    Download the compiled assets published on Hugging Face for `model_name`.

    Parameters:
        model_name: str
            Name passed to `config.get_huggingface_path` to locate the repository.

        dst_folder: str | Path
            Local directory that receives the downloads. Created if missing.

        runtime_path: TargetRuntime
            TFLITE selects `*.tflite` files; QNN selects `*.so` and `*.bin` files.

        config: ModelZooAssetConfig
            Asset config used to resolve the Hugging Face path.

    Returns:
        Local paths returned by `hf_hub_download`, one per matched file.

    Raises:
        NotImplementedError: if `runtime_path` is neither TFLITE nor QNN.
        FileNotFoundError: if no file of the requested type(s) is found.
    """
    import posixpath

    fs = HfFileSystem()
    hf_path = config.get_huggingface_path(model_name)

    if runtime_path == TargetRuntime.TFLITE:
        file_types = ["tflite"]
    elif runtime_path == TargetRuntime.QNN:
        file_types = ["so", "bin"]
    else:
        raise NotImplementedError(
            f"Fetching assets for runtime {runtime_path} is not supported."
        )

    files = []
    for file_type in file_types:
        # Hub paths always use "/", whatever the local OS separator is. The
        # slice below relies on that too, so build the pattern with posixpath.
        files += fs.glob(posixpath.join(hf_path, f"**/*.{file_type}"))
    if not files:
        raise FileNotFoundError(
            f"No compiled assets are available on Huggingface for {model_name} with runtime {runtime_path.name}."
        )

    os.makedirs(dst_folder, exist_ok=True)
    paths = []
    for file in files:
        # Strip the "<hf_path>/" prefix to get the file name relative to the repo.
        path = hf_hub_download(hf_path, file[len(hf_path) + 1 :], local_dir=dst_folder)
        paths.append(path)

    return paths
def preprocess_PIL_image(image: Image) -> torch.Tensor:
    """
    Turn a PIL image into a float pyTorch tensor with a leading batch dimension.

    The image goes through torchvision's PILToTensor, is cast to float, gets a
    batch axis prepended, and is divided by 255 so that 8-bit pixel values
    land in [0, 1]. No channel reordering is done here.
    """
    to_tensor = transforms.Compose([transforms.PILToTensor()])
    pixels: torch.Tensor = to_tensor(image)  # type: ignore
    batched = pixels.float().unsqueeze(0)
    # int 0 - 255 to float 0.0 - 1.0
    return batched / 255.0
def resize_pad(image: torch.Tensor, dst_size: Tuple[int, int]):
    """
    Resize `image` to fit inside `dst_size` while keeping its aspect ratio,
    then pad it to exactly [..., dst_size[0], dst_size[1]].

    Parameters:
        image: (N, C, H, W)
            Image to reshape. Bilinear `interpolate` requires a 4D tensor.

        dst_size: (height, width)
            Size to which the image should be reshaped.

    Returns:
        rescaled_padded_image: torch.Tensor (..., dst_size[0], dst_size[1])
        scale: single float factor applied to both height and width
        pad: pixels of padding added to the rescaled image: (left_padding, top_padding)

    Based on https://github.com/zmurez/MediaPipePyTorch/blob/master/blazebase.py
    """
    import math

    height, width = image.shape[-2:]
    dst_frame_height, dst_frame_width = dst_size

    h_ratio = dst_frame_height / height
    w_ratio = dst_frame_width / width
    # The smaller ratio is the largest scale at which BOTH dimensions still fit
    # inside the destination. Comparing the rescaled width against the
    # destination height is only right for square targets; for other targets it
    # can pick the larger ratio and produce negative padding (a crop).
    scale = min(h_ratio, w_ratio)

    new_height = math.floor(height * scale)
    new_width = math.floor(width * scale)
    pad_h = dst_frame_height - new_height
    pad_w = dst_frame_width - new_width

    # Split the padding evenly; an odd leftover pixel goes to the bottom / right.
    pad_top = int(pad_h // 2)
    pad_bottom = int(pad_h // 2 + pad_h % 2)
    pad_left = int(pad_w // 2)
    pad_right = int(pad_w // 2 + pad_w % 2)

    rescaled_image = interpolate(
        image, size=[int(new_height), int(new_width)], mode="bilinear"
    )
    # torch's pad takes the last dimension first: (left, right, top, bottom).
    rescaled_padded_image = pad(
        rescaled_image, (pad_left, pad_right, pad_top, pad_bottom)
    )
    padding = (pad_left, pad_top)

    return rescaled_padded_image, scale, padding
def denormalize_coordinates(
    coordinates: torch.Tensor,
    input_img_size: Tuple[int, int],
    scale: float = 1.0,
    pad: Tuple[int, int] = (0, 0),
) -> None:
    """
    Convert normalized [0, 1] detection coordinates into coordinates in the original image.

    This function can be exported and run inside inference frameworks if desired.

    Note: If included in the model, this code is likely to be unfriendly to quantization.
          This is because of the high range and variability of the output tensor.

          For best quantization accuracy, this code should be run separately from the model,
          or the model should de-quantize activations before running these layers.

    Inputs:
        coordinates: [..., 2] tensor
            coordinates. Range must be [0, 1]

        input_img_size: Tuple(int, int)
            The size of the tensor that was fed to the NETWORK (NOT the original image size).
            H / W is the same order as coordinates.

        scale: float
            Scale factor that was used to resize the image fed to the network.

        pad: Tuple(int, int)
            Padding used during resizing of input image to network input tensor.
            This is the absolute # of padding pixels in the network input tensor, NOT in the original image.
            H / W is in the same order as coordinates.

    Outputs:
        None. The "coordinates" input is modified in place: each of the two
        trailing components is scaled to network-input pixels, has its padding
        removed, is divided by `scale`, and is truncated to a whole number.
    """
    size_0, size_1 = input_img_size
    offset_0, offset_1 = pad

    # Same arithmetic for both trailing components, each with its own extent / padding.
    for axis, extent, offset in ((0, size_0, offset_0), (1, size_1, offset_1)):
        in_network_pixels = coordinates[..., axis] * extent - offset
        coordinates[..., axis] = (in_network_pixels / scale).int()
Shape is [ ..., 2 ], where 2 == [X, Y] + """ + return (affine[:, :2] @ coordinates.T + affine[:, 2:]).T + + +def compute_vector_rotation( + vec_start: torch.Tensor, + vec_end: torch.Tensor, + offset_rads: float | torch.Tensor = 0, +) -> torch.Tensor: + """ + From the given vector, compute the rotation of the vector with added offset. + + Inputs: + vec_start: torch.Tensor + Starting point of the vector. Shape is [B, 2], where 2 == (x, y) + vec_end: torch.Tensor + Ending point of the vector. Shape is [B, 2], where 2 == (x, y) + offset_rads: float | torch.Tensor + Offset to subtract from the rotation calculation. + Can be size [1] or [ Batch ] + + Outputs: + theta: computed rotation angle in radians. Shape is [Batch] + """ + return ( + torch.atan2( + vec_start[..., 1] - vec_end[..., 1], vec_start[..., 0] - vec_end[..., 0] + ) + - offset_rads + ) diff --git a/qai_hub_models/utils/inference.py b/qai_hub_models/utils/inference.py new file mode 100644 index 00000000..2e860974 --- /dev/null +++ b/qai_hub_models/utils/inference.py @@ -0,0 +1,284 @@ +from __future__ import annotations + +import os +import tempfile +from typing import List, Tuple + +import numpy as np +import qai_hub as hub +import torch +from qai_hub.public_rest_api import DatasetEntries + +from qai_hub_models.utils.base_model import BaseModel, SourceModelFormat, TargetRuntime +from qai_hub_models.utils.input_spec import InputSpec +from qai_hub_models.utils.qai_hub_helpers import ( + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import is_qnn_hub_model + +try: + from qai_hub_models.utils.quantization_aimet import AIMETQuantizableMixin +except NotImplementedError: + AIMETQuantizableMixin = None # type: ignore + + +def prepare_compile_zoo_model_to_hub( + model: BaseModel, + source_model_format: SourceModelFormat, + target_runtime: TargetRuntime, + output_path: str = "", + input_spec: InputSpec | None = None, + check_trace: bool = True, + 
prepare_compile_options_only: bool = False, +) -> Tuple[str | None, str]: + """ + Args: + + - (source_model_format, target_runtime): One of the followings + + (1) (ONNX, QNN) + + (a) For fp32 model, torch -> onnx -> qnn. + + (b) For AIMET, torch -> onnx + aimet encodings -> qnn + + (2) (ONNX, TFLITE) + + (a) For fp32, torch (fp32) -> onnx -> tflite, + + (b) For quantized, torch(AIMET) -> onnx + aimet .encodings -> tflite + (via qnn-onnx-converter). + + (3) (TORCHSCRIPT, TFLITE) + + (a) Fp32: Invalid option for model not subclass of AIMETQuantizableMixin + + (b) For AIMETQuantizableMixin subclass, torch(AIMET) -> + torchscript with embedded quantizer -> tflite + + (4) (TORCHSCRIPT, QNN) + + (a) For fp32, torch -> qnn (via qnn-torch-converter, aka + --use_qnn_pytorch_converter flag in Hub) + + (b) For AIMETQuantizableMixin subclass, torch(AIMET) -> + torchscript with embedded quantizer -> qnn (via + qnn-pytorch-converter) + + Returns: + + Path to source model that can be used directly with hub.upload_model or + hub.submit_compile_job. 
def compile_zoo_model_to_hub(
    model: BaseModel,
    device: hub.Device,
    source_model_format: SourceModelFormat,
    target_runtime: TargetRuntime,
    calibration_data: DatasetEntries | None = None,
    input_spec: InputSpec | None = None,
    inference_options: str = "",
    check_trace: bool = True,
) -> HubModel:
    """
    Similar to `prepare_compile_zoo_model_to_hub`, but also performs the
    compilation on AI Hub and constructs a HubModel object.

    Parameters:
        model: Model to export and compile.
        device: Hub device to compile for; also stored on the returned HubModel.
        source_model_format: Format the model is exported to before upload.
        target_runtime: Runtime the compile job targets.
        calibration_data: Optional calibration data forwarded to the compile job.
        input_spec: Input names / shapes / types. Defaults to `model.get_input_spec()`.
        inference_options: Options string stored on the returned HubModel.
        check_trace: Forwarded to the export step.

    Returns:
        HubModel wrapping the compiled target model.

    Raises:
        ValueError: if the compile job does not succeed.
    """

    if input_spec is None:
        input_spec = model.get_input_spec()

    model_name = model.__class__.__name__

    with tempfile.TemporaryDirectory() as tmp_dir:
        assert tmp_dir is not None
        # Forward the resolved input_spec so the exported source model and the
        # compile job below agree on input shapes and types.
        source_model, compilation_options = prepare_compile_zoo_model_to_hub(
            model=model,
            source_model_format=source_model_format,
            target_runtime=target_runtime,
            output_path=tmp_dir,
            input_spec=input_spec,
            check_trace=check_trace,
        )

        # Submit while the temporary directory (and the exported file) still exists.
        compile_job = hub.submit_compile_job(
            model=source_model,
            input_specs=input_spec,
            device=device,
            name=f"{model_name}_{source_model_format.name}_{target_runtime.name}",
            options=compilation_options,
            calibration_data=calibration_data,
        )
    assert isinstance(compile_job, hub.CompileJob)
    if not compile_job.wait().success:
        job_msg = compile_job.get_status().message or "(no job failure message)"
        raise ValueError(f"Compile job {compile_job} failed: {job_msg}")

    hub_model = compile_job.get_target_model()
    assert hub_model is not None
    # Use the names from the spec the model was actually compiled with.
    input_names = list(input_spec.keys())
    return HubModel(
        hub_model,
        input_names,
        device,
        inference_options=inference_options,
    )
+ """ + + def __init__( + self, + model: hub.Model, + input_names: List[str], + device: hub.Device, + inference_options: str = "", + ): + self.model = model + self.input_names = input_names + self.device = device + self.inference_options = inference_options + + def __call__( + self, + *input_tensors: torch.Tensor + | List[torch.Tensor] + | hub.Dataset + | DatasetEntries, + ) -> torch.Tensor | Tuple[torch.Tensor, ...]: + inputs: hub.Dataset | DatasetEntries + if len(input_tensors) == 1 and isinstance(input_tensors[0], hub.Dataset): + inputs = input_tensors[0] + else: + # Upload dataset + inputs = {} + for name, tensor in zip(self.input_names, input_tensors): + if isinstance(tensor, (list, tuple)): + inputs[name] = [t.detach().numpy() for t in tensor] # type: ignore + else: + inputs[name] = [tensor.detach().numpy()] # type: ignore + target_runtime = ( + TargetRuntime.QNN if is_qnn_hub_model(self.model) else TargetRuntime.TFLITE + ) + channel_last_input, channel_last_output = "", "" + if self.model.producer is not None: + model_options = self.model.producer.options.strip().split() + for option_num in range(len(model_options)): + if model_options[option_num] == "--force_channel_last_input": + channel_last_input = model_options[option_num + 1] + if model_options[option_num] == "--force_channel_last_output": + channel_last_output = model_options[option_num + 1] + if channel_last_input != "": + inputs = transpose_channel_first_to_last( + channel_last_input, inputs, target_runtime + ) + + inference_job = hub.submit_inference_job( + model=self.model, + inputs=inputs, + device=self.device, + name=f"{self.model.name}_demo_inference", + options=self.inference_options, + ) + assert isinstance(inference_job, hub.InferenceJob) + if not inference_job.wait().success: + job_msg = inference_job.get_status().message or "(no job failure message)" + raise ValueError(f"Inference job {inference_job} failed: {job_msg}") + + output_ds_handle = inference_job.get_output_dataset() + assert 
def str_to_torch_dtype(s):
    """
    Look up the torch dtype named by `s`.

    Only "int32" and "float32" are known; any other name raises KeyError.
    """
    dtype_by_name = {
        "int32": torch.int32,
        "float32": torch.float32,
    }
    return dtype_by_name[s]
def display_with_sig_figs(num: float, num_sig_figs: int = 3) -> str:
    """
    Displays the given number as a string with the appropriate number of
    significant figures. Examples:
        display_with_sig_figs(1234.2, num_sig_figs=3) -> "1230"
        display_with_sig_figs(0.001234, num_sig_figs=3) -> "0.00123"
        display_with_sig_figs(-1.234, num_sig_figs=3) -> "-1.23"

    Parameters:
        num: Number to display.
        num_sig_figs: How many sig figs to use.

    Returns:
        Fixed-point string (never scientific notation). Non-finite values are
        returned as their plain string form ("inf", "-inf", "nan").
    """
    import math

    rounded_num = float(f"{num:.{num_sig_figs}g}")
    if not math.isfinite(rounded_num):
        return str(rounded_num)

    # Read the decimal exponent of the leading significant digit from
    # scientific notation. Counting characters of int(rounded_num) would count
    # the "-" sign as a digit and treat every |x| < 1 as having one digit.
    mantissa_digits = max(num_sig_figs - 1, 0)
    exponent = int(f"{abs(rounded_num):.{mantissa_digits}e}".partition("e")[2])

    # Only display as many numbers after the decimal point to fit number of sig figs
    return f"{rounded_num:.{max(0, num_sig_figs - 1 - exponent)}f}"
def get_model_size_mb(hub_model: hub.Model) -> float:
    """
    Return target model size in MB. This is a special case for ease of testing.

    The model is downloaded into a temporary directory, measured with
    `get_disk_size(..., unit="MB")`, and the directory is removed on exit.
    """
    assert hub_model is not None
    with tempfile.TemporaryDirectory() as scratch_dir:
        target = Path(scratch_dir) / "model"
        # Download the model into the temporary directory
        hub_model.download(target)  # type: ignore
        return get_disk_size(target, unit="MB")
def flatten(obj):
    """
    Flatten arbitrarily nested lists / tuples into one flat list.

    Items are visited depth-first, left to right. Anything that is not a list
    or tuple (including other iterables such as strings) is kept as a single
    element.

    Parameters:
        obj: Iterable whose items may themselves be lists or tuples.

    Returns:
        A new list holding the non-list, non-tuple items in traversal order.
    """
    tgt_type = (list, tuple)  # targeted types
    flattened_list = []
    for item in obj:
        if isinstance(item, tgt_type):
            # flatten takes a single argument; passing tgt_type as a second
            # one raised TypeError on the first nested list or tuple.
            flattened_list.extend(flatten(item))
        else:
            flattened_list.append(item)
    return flattened_list
class MODEL_CARD_RUNTIMES(Enum):
    """Runtime to be stored in model card."""

    TORCHSCRIPT_ONNX_TFLITE = 100
    TORCHSCRIPT_ONNX_QNN = 101

    @staticmethod
    def from_string(string: str) -> "MODEL_CARD_RUNTIMES":
        """
        Map a runtime name such as "tflite" or "qnn" (any letter case) to its member.

        Raises KeyError when no member is named TORCHSCRIPT_ONNX_<NAME>.
        """
        member_name = f"TORCHSCRIPT_ONNX_{string.upper()}"
        return MODEL_CARD_RUNTIMES[member_name]
self.profile_job.download_profile() + return None + + def get_inference_time(self) -> Union[float, str]: + """Get the inference time from the profile job.""" + if self.profile_results is not None: + return float( + self.profile_results["execution_summary"]["estimated_inference_time"] + ) + return "null" + + def get_throughput(self) -> Union[float, str]: + """Get the throughput from the profile job.""" + if not isinstance(self.get_inference_time(), str): + return 1000000 / self.get_inference_time() # type: ignore + return "null" + + def get_layer_info(self, unit: str) -> int: + """Count layers per compute unit.""" + if self.profile_results is not None: + count: int = 0 + count = sum( + 1 + for detail in self.profile_results["execution_detail"] + if detail["compute_unit"] == unit + ) + return count + return 0 + + def npu(self) -> Any: + """Get number of layers running on NPU.""" + return self.get_layer_info("NPU") if self.profile_results is not None else 0 + + def gpu(self) -> Any: + """Get number of layers running on GPU.""" + return self.get_layer_info("GPU") if self.profile_results is not None else 0 + + def cpu(self) -> Any: + """Get number of layers running on CPU.""" + return self.get_layer_info("CPU") if self.profile_results is not None else 0 + + def total(self) -> Any: + """Get the total number of layers.""" + return self.npu() + self.gpu() + self.cpu() + + def primary_compute_unit(self) -> str: + """Get the primary compute unit.""" + layers_npu = self.npu() + layers_gpu = self.gpu() + layers_cpu = self.cpu() + + if layers_npu == 0 and layers_gpu == 0 and layers_cpu == 0: + return "null" + compute_unit_for_most_layers = max(layers_cpu, layers_gpu, layers_npu) + if compute_unit_for_most_layers == layers_npu: + return "NPU" + elif compute_unit_for_most_layers == layers_gpu: + return "GPU" + return "CPU" + + def get_peak_memory_range(self) -> Dict[str, int]: + """Get the estimated peak memory range.""" + if self.profile_results is not None: + low, high = 
self.profile_results["execution_summary"][ + "inference_memory_peak_range" + ] + return dict(min=low, max=high) + return dict(min=0, max=0) + + def precision(self) -> str: + """Get the precision of the model based on the run.""" + if self.profile_results is not None: + compute_unit = self.primary_compute_unit() + if compute_unit == "CPU": + return "fp32" + if self.quantized == "Yes": + return "int8" + return "fp16" + return "null" + + +@dataclass +class ModelPerf: + model_runs: List[ModelRun] + + def supported_chipsets(self, chips) -> List[str]: + """Return all the supported chipsets given the chipset it works on.""" + supported_chips = chips + for chip in chips: + if chip == "qualcomm-snapdragon-8gen2": + supported_chips.extend( + ["qualcomm-snapdragon-8gen1", "qualcomm-snapdragon-888"] + ) + if chip == "qualcomm-snapdragon-855": + supported_chips.extend( + ["qualcomm-snapdragon-845", "qualcomm-snapdragon-865"] + ) + return supported_chips + + def supported_chipsets_santized(self, chips) -> List[str]: + """Santize the chip name passed via hub.""" + chips = [chip for chip in chips if chip != ""] + return sorted( + list( + set( + [ + chipset_marketting_name(chip) + for chip in self.supported_chipsets(chips) + ] + ) + ) + ) + + def supported_devices(self, chips) -> List[str]: + """Return all the supported devicesgiven the chipset being used.""" + supported_devices = [] + for chip in self.supported_chipsets(chips): + supported_devices.extend( + [ + device.name + for device in hub.get_devices(attributes=f"chipset:{chip}") + ] + ) + supported_devices.extend( + [ + "Google Pixel 3", + "Google Pixel 3a", + "Google Pixel 4", + "Google Pixel 3a XL", + "Google Pixel 4a", + "Google Pixel 5a 5G", + ] + ) + return sorted(list(set(supported_devices))) + + def supported_oses(self) -> List[str]: + """Return all the supported operating systems.""" + return ["Android"] + + def reference_device_info(self) -> Dict[str, str]: + """Return a reference ID.""" + chipset = 
"qualcomm-snapdragon-8gen2" + hub_device = hub.get_devices("Samsung Galaxy S23 Ultra")[0] + device_name = hub_device.name + os_version = hub_device.os + os_name, form_factor, manufacturer = "", "", "" + for attr in hub_device.attributes: + if attr.startswith("vendor"): + manufacturer = attr.split(":")[-1] + if attr.startswith("format"): + form_factor = attr.split(":")[-1] + if attr.startswith("os"): + os_name = attr.split(":")[-1].capitalize() + chipset = chipset_marketting_name(chipset) + device_info = dict( + name=device_name, + os=os_version, + form_factor=form_factor.capitalize(), + os_name=os_name, + manufacturer=manufacturer.capitalize(), + chipset=chipset, + ) + return device_info + + def performance_metrics(self): + """Performance metrics as per model card.""" + perf_card = dict() + + # Figure out unique models in various baselines + unique_model_ids = [] + chips = [] + for run in self.model_runs: + if run.model_id not in unique_model_ids: + unique_model_ids.append(run.model_id) + if run.chipset not in chips: + chips.append(run.chipset()) + + perf_card["aggregated"] = dict( + supported_oses=self.supported_oses(), + supported_devices=self.supported_devices(chips), + supported_chipsets=self.supported_chipsets_santized(chips), + ) + + perf_per_model = [] + + for mid in unique_model_ids: + perf_per_device = [] + # Calculate per data per runtime + perf_per_runtime = dict() + for run in self.model_runs: + if run.model_id == mid: + runtime_name = run.runtime.name.lower() + perf_per_runtime[runtime_name] = dict( + inference_time=run.get_inference_time(), + throughput=run.get_throughput(), + estimated_peak_memory_range=run.get_peak_memory_range(), + primary_compute_unit=run.primary_compute_unit(), + precision=run.precision(), + layer_info=dict( + layers_on_npu=run.npu(), + layers_on_gpu=run.gpu(), + layers_on_cpu=run.cpu(), + total_layers=run.total(), + ), + job_id=run.profile_job_id, + job_status=run.job_status(), + ) + + # Per model, the device used and timestamp 
for model card + perf_per_runtime["reference_device_info"] = self.reference_device_info() + perf_per_runtime["timestamp"] = datetime.datetime.utcnow().isoformat() + "Z" + + perf_per_device.append(perf_per_runtime) + + perf_model = dict(name=mid, performance_metrics=perf_per_device) + perf_model["name"] = mid + perf_per_model.append(perf_model) + + # Perf card with multiple models + perf_card["models"] = perf_per_model + return perf_card diff --git a/qai_hub_models/utils/path_helpers.py b/qai_hub_models/utils/path_helpers.py new file mode 100644 index 00000000..3729acf9 --- /dev/null +++ b/qai_hub_models/utils/path_helpers.py @@ -0,0 +1,28 @@ +from pathlib import Path +from typing import Optional + +MODELS_PACKAGE_NAME = "models" +QAIHM_PACKAGE_NAME = "qai_hub_models" + + +def get_all_models(): + zoo_root = get_qaihm_models_root() + all_models = [] + for subdir in zoo_root.iterdir(): + if not subdir.is_dir(): + continue + # Heuristic to see if this is a model we should generate export.py for. 
RUNTIMES_TO_COMPARE = ["torchscript_onnx_qnn", "torchscript_onnx_tflite"]


class PerformanceSummary:
    """
    Generates Perf Summary of two 'performance_metrics' from perf.yaml

    Perf summary is generated w.r.t 'perf_buckets' to summarize difference in decreasing order
        - "INF" -> Inference failure toggled.
        - 10 -> Speedup difference >= 10 and so on ...

    Why use speedup difference?
        - Speedup is relative to baseline measured with similar constraints and changes
        - Speedup difference gives a general sense of how performance diverged w.r.t. baseline at that point

    What all to capture in the summary (Summary of Interest) ?
        1. Inferences that started to fail or work (Speedup = "INF")
        2. Speedup difference >= 0.1 (check models closely from higher buckets)
        3. Missing devices (new runs missing data for certain devices)
        4. New models (models with new perf.yamls)
        5. Empty perf reports (models with no passing jobs)
    """

    def __init__(self):
        # List of new reports being added
        self.new_perf_report: List[Tuple[str]] = []

        # Device present in previous run, but missing in new
        self.missing_devices: List = []

        # Perf report with no passing job
        self.empty_perf_report: List[Tuple[str]] = []

        # Perf buckets to track
        self.perf_buckets = ["inf", 10, 5, 2, 1.5, 1.3, 1.2, 1.1, 1.05, 1.03]

        # Only track PerfSummary for Android
        self.tracked_oses: List = ["Android"]

        # Map of perf_bucket -> List of tuple of progression summary entry
        self.progressions: Dict = {bucket: [] for bucket in self.perf_buckets}

        # Map of perf_bucket -> List of tuple of regression summary entry
        self.regressions: Dict = {bucket: [] for bucket in self.perf_buckets}

    def add_missing_model(self, model_id: str):
        """Record a model that has a perf report now but had none before."""
        self.new_perf_report.append((model_id,))

    def _format_speedup(self, num):
        """Round a number to 5 decimals; strings (e.g. "null") pass through."""
        if isinstance(num, str):
            return num
        return float(format(num, ".5f"))

    def update_summary(self, model_id: str, previous_report, new_report):
        """Compare two parsed perf reports of one model and record the changes.

        Both reports are dicts with a "models" list; each model has a "name"
        and a "performance_metrics" list whose entries hold per-runtime
        results plus "reference_device_info".

        A runtime that is absent from either report's entry is skipped
        instead of raising KeyError, so models profiled on a single runtime
        can be summarized.
        """
        prev_perf_metrics = {}
        new_perf_metrics = {}

        # Create chipset to perf metric
        for prev_model in previous_report["models"]:
            for new_model in new_report["models"]:
                if prev_model["name"] != new_model["name"]:
                    continue
                for prev_metric in prev_model["performance_metrics"]:
                    if "chipset" in prev_metric["reference_device_info"]:
                        ref_device = prev_metric["reference_device_info"]["chipset"]
                        prev_perf_metrics[ref_device] = prev_metric

                for new_metric in new_model["performance_metrics"]:
                    if "chipset" in new_metric["reference_device_info"]:
                        ref_device = new_metric["reference_device_info"]["chipset"]
                        new_perf_metrics[ref_device] = new_metric

        if len(prev_perf_metrics) == 0 or len(new_perf_metrics) == 0:
            self.empty_perf_report.append((model_id,))

        for device, prev_entry in prev_perf_metrics.items():
            device_info = prev_entry["reference_device_info"]
            if device_info["os_name"] not in self.tracked_oses:
                continue

            # Case 3: Chipset is missing in new data
            if device not in new_perf_metrics:
                self.missing_devices.append((model_id, device))
                continue
            new_entry = new_perf_metrics[device]

            for runtime_type in RUNTIMES_TO_COMPARE:
                # Nothing to compare unless both reports ran this runtime.
                if runtime_type not in prev_entry or runtime_type not in new_entry:
                    continue
                prev_inference_time = prev_entry[runtime_type]["inference_time"]
                new_inference_time = new_entry[runtime_type]["inference_time"]
                if new_inference_time == prev_inference_time:
                    continue

                if new_inference_time == "null" or prev_inference_time == "null":
                    # Case 1: Model either failed to infer or had a successful run
                    summary_entry = (
                        model_id,
                        runtime_type,
                        "inf",
                        self._format_speedup(new_inference_time),
                        self._format_speedup(prev_inference_time),
                        device_info["chipset"],
                        device_info["os"],
                    )
                    if new_inference_time == "null":
                        self.regressions["inf"].append(summary_entry)
                    else:
                        self.progressions["inf"].append(summary_entry)
                    continue

                # Case 2: Bucketize speedup difference
                progression_speedup = float(prev_inference_time) / float(
                    new_inference_time
                )
                is_progression = progression_speedup >= 1
                if is_progression:
                    speedup = progression_speedup
                else:
                    speedup = float(new_inference_time) / float(prev_inference_time)

                # Buckets are in decreasing order: take the largest one reached.
                for bucket in self.perf_buckets[1:]:
                    if bucket <= speedup:
                        summary = (
                            model_id,
                            runtime_type,
                            self._format_speedup(speedup),
                            self._format_speedup(new_inference_time),
                            self._format_speedup(prev_inference_time),
                            device_info["chipset"],
                            device_info["os"],
                        )
                        if is_progression:
                            self.progressions[bucket].append(summary)
                        else:
                            self.regressions[bucket].append(summary)
                        break

    def _get_summary_table(self, bucket_id, get_progressions=True):
        """
        Returns Summary Table for given bucket
        Args:
            bucket_id : bucket_id from perf_buckets
            get_progressions : read progressions if True, regressions otherwise
        """
        table = PrettyTable(
            [
                "Model ID",
                "Runtime",
                "Kx faster" if get_progressions else "Kx slower",
                "New Inference time",
                "Prev Inference time",
                "Chipset",
                "OS",
            ]
        )
        data = self.progressions if get_progressions else self.regressions
        # Sort a copy by speedup so building a table does not reorder the
        # recorded entries.
        table.add_rows(sorted(data[bucket_id], key=lambda row: row[2]))
        return table

    def _has_perf_changes(self):
        """Returns True if there are perf changes"""
        return any(self.progressions.values()) or any(self.regressions.values())

    def print_summary(self):
        """
        Prints Perf change summary captured so far.

        The summary is written to build/test-results/perf-summary-<time>.txt
        and the path is printed to stdout.
        """

        file_unique_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        test_results_path = os.path.join("build", "test-results")
        os.makedirs(test_results_path, exist_ok=True)
        summary_file_path = os.path.join(
            test_results_path, f"perf-summary-{file_unique_name}.txt"
        )

        # Explicit UTF-8: chipset names contain non-ASCII characters ("®").
        with open(summary_file_path, "w", encoding="utf-8") as sf:
            sf.write("================= Perf Change Summary =================")
            if self._has_perf_changes():
                sf.write("\n\n----------------- Regressions -----------------\n")
                # Dumps Point 1 and 2 from Summary of Interest
                # 1. Inferences that started to fail (Speedup = "INF")
                # 2. Slower than previous run
                for bucket in self.perf_buckets:
                    if len(self.regressions[bucket]) > 0:
                        sf.write(
                            f"\n----------------- >= {bucket}x slower -----------------\n"
                        )
                        sf.write(
                            str(self._get_summary_table(bucket, get_progressions=False))
                        )

                sf.write("\n\n----------------- Progressions -----------------\n")

                # Dumps Point 1 and 2 from Summary of Interest
                # 1. Inferences that started to work (Speedup = "INF")
                # 2. Faster than previous run
                for bucket in self.perf_buckets:
                    if len(self.progressions[bucket]) > 0:
                        sf.write(
                            f"\n----------------- >= {bucket}x faster -----------------\n"
                        )
                        sf.write(str(self._get_summary_table(bucket)))
            else:
                sf.write("\nNo significant changes observed.")

            if len(self.missing_devices) > 0:
                # 3. Missing devices (New runs missing data for certain devices)
                sf.write("\n----------------- Missing devices -----------------\n")
                table = PrettyTable(["Model ID", "Missing Device"])
                table.add_rows(self.missing_devices)
                sf.write(str(table))

            if len(self.new_perf_report) > 0:
                # 4. New Models (Models that did not have perf.yaml previously)
                sf.write("\n----------------- New models -----------------\n")
                table = PrettyTable(["Model ID"])
                table.add_rows(self.new_perf_report)
                sf.write(str(table))

            if len(self.empty_perf_report) > 0:
                # 5. Empty reports (Models with no passing jobs)
                sf.write(
                    "\n----------------- Empty reports (No passing jobs) -----------------\n"
                )
                table = PrettyTable(["Model ID"])
                table.add_rows(self.empty_perf_report)
                sf.write(str(table))

        print(f"Perf change summary written to {summary_file_path}")
Dict[str, List[np.ndarray]], + torch_out: List[np.ndarray], + outputs_to_skip: Optional[List[int]] = None, +) -> None: + inference_data = [ + np.concatenate(outputs, axis=0) for outputs in inference_result.values() + ] + output_names = list(inference_result.keys()) + metrics = generate_comparison_metrics(torch_out, inference_data) + print( + f"\nComparing on-device vs. local-cpu inference for {inference_job.name.title()}." + ) + + table = PrettyTable(align="l") + table.field_names = ["Name", "Shape", "Peak Signal-to-Noise Ratio (PSNR)"] + outputs_to_skip = outputs_to_skip or [] + i = 0 + while i in metrics or i in outputs_to_skip: + if i in outputs_to_skip or np.prod(np.array(metrics[i].shape)) < 5: + table.add_row([output_names[i], metrics[i].shape, "Skipped"]) + i += 1 + continue + table.add_row([output_names[i], metrics[i].shape, f"{metrics[i].psnr:.4g} dB"]) + i += 1 + + print(table.get_string()) + last_line = f"More details: {inference_job.url}" + print(last_line) + + +def print_profile_metrics_from_job( + profile_job: hub.ProfileJob, + profile_data: Dict[str, Any], +): + compute_unit_counts = Counter( + [op.get("compute_unit", "UNK") for op in profile_data["execution_detail"]] + ) + execution_summary = profile_data["execution_summary"] + inference_time_ms = execution_summary["estimated_inference_time"] / 1000 + peak_memory_bytes = execution_summary["inference_memory_peak_range"] + print(f"\n{_INFO_DASH}") + print(f"Performance results on-device for {profile_job.name.title()}.") + print(_INFO_DASH) + + if profile_job.model.model_type == SourceModelType.TFLITE: + runtime = TargetRuntime.TFLITE + elif is_qnn_hub_model(profile_job.model): + runtime = TargetRuntime.QNN + else: + raise NotImplementedError() + + print_profile_metrics( + QAIHMModelPerf.ModelRuntimePerformanceDetails( + profile_job.model.name, + profile_job.device.name, + profile_job.device.os, + runtime, + inference_time_ms, + peak_memory_bytes, + compute_unit_counts, + ) + ) + print(_INFO_DASH) + 
last_line = f"More details: {profile_job.url}\n" + print(last_line) + + +def print_profile_metrics( + details: QAIHMModelPerf.ModelRuntimePerformanceDetails, +): + inf_time = details.inference_time_ms + peak_memory_bytes = f"[{round(details.peak_memory_bytes[0] / 1e6)}, {round(details.peak_memory_bytes[1] / 1e6)}]" + num_ops = sum(details.compute_unit_counts.values()) + compute_units = [ + f"{unit} ({num_ops} ops)" + for unit, num_ops in details.compute_unit_counts.items() + ] + + rows = [ + ["Device", f"{details.device_name} ({details.device_os})"], + ["Runtime", f"{details.runtime.name}"], + [ + "Estimated inference time", + "less than 0.1ms" if inf_time < 0.1 else f"{inf_time}", + ], + ["Estimated peak memory usage", f"{peak_memory_bytes}"], + ["Total # Ops", f"{num_ops}"], + ["Compute Unit(s)", " ".join(compute_units)], + ] + table = PrettyTable(align="l", header=False, border=False, padding_width=0) + for row in rows: + table.add_row([row[0], f": {row[1]}"]) + print(table.get_string()) + + +def print_on_target_demo_cmd( + compile_job: hub.CompileJob, model_folder: Path, device: str +) -> None: + """ + Outputs a command that will run a model's demo script via inference job. 
def _channel_transpose_order(rank: int, first_to_last: bool) -> List[int]:
    """Return the axis permutation moving the channel axis for a rank >= 3 array.

    first_to_last=True moves axis -3 to the end (e.g. NCHW -> NHWC);
    first_to_last=False is the exact inverse (e.g. NHWC -> NCHW).
    """
    order = list(range(rank))
    if first_to_last:
        order.append(order.pop(-3))
    else:
        # After the pop the list has rank-1 entries, so the channel axis must
        # go in front of the last two (spatial) axes, i.e. at index -2.
        # Inserting at -3 was only right for rank 3 and produced
        # [3, 0, 1, 2] instead of [0, 3, 1, 2] for rank 4.
        order.insert(-2, order.pop(-1))
    return order


def transpose_channel(
    io_names: str,
    inputs: Union[hub.Dataset, Dict[str, Any]],
    target_runtime: "TargetRuntime",
    first_to_last: bool,
):
    """Transpose the channel axis of the named entries of ``inputs``.

    Parameters:
        io_names: Comma separated names of the entries to transpose.
        inputs: Mapping of name -> list of arrays. Must be a dict.
        target_runtime: Runtime the data is for; QNN only transposes arrays
            of rank >= 4, other runtimes rank >= 3.
        first_to_last: True for channel-first -> channel-last, False for the
            inverse.

    Returns:
        A new dict. Entries that are not named, or whose first array has a
        rank below the minimum or above 5, are passed through untouched.
    """
    min_dim = 4 if target_runtime == TargetRuntime.QNN else 3
    io_names_list = io_names.strip().split(",")
    target = dict()

    assert isinstance(inputs, dict)
    for name, array in inputs.items():
        rank = len(array[0].shape)
        if rank < min_dim or rank > 5 or name not in io_names_list:
            target[name] = array
            continue
        transpose_order = _channel_transpose_order(rank, first_to_last)
        target[name] = [np.transpose(arr, transpose_order) for arr in array]
    return target
Dict[str, List[np.ndarray]]: + return transpose_channel(io_names, sample_inputs, target_runtime, True) + + +def transpose_channel_last_to_first( + io_names: str, + job_outputs: Union[hub.Dataset, Dict[str, Any]], + target_runtime: "TargetRuntime", +) -> Dict[str, List[np.ndarray]]: + return transpose_channel(io_names, job_outputs, target_runtime, False) + + +def can_access_qualcomm_ai_hub(): + try: + hub.get_devices() + except APIException: + return False + except UserError: + return False + return True + + +_AIHUB_URL = "https://aihub.qualcomm.com" +_AIHUB_NAME = "Qualcomm® AI Hub" +_WARNING_DASH = "=" * 114 +_INFO_DASH = "-" * 55 + + +def export_without_hub_access( + model_id: str, + model_display_name: str, + device_name: str, + skip_profiling: bool, + skip_inferencing: bool, + skip_downloading: bool, + skip_summary: bool, + output_path: str, + target_runtime: TargetRuntime, + compile_options: str, + profile_options: str, + components: List[str] | None = None, +) -> List[str] | None: + print(_WARNING_DASH) + print( + f"Unable to find a valid API token for {_AIHUB_NAME}. Using results from a previous job run on the same device.\n" + f"To get access to the complete experience, please sign-up for access at {_AIHUB_URL}." + ) + print(_WARNING_DASH) + + if compile_options or profile_options: + raise RuntimeError( + f"Jobs with `compile_options` or `profile_options` can only be run with {_AIHUB_NAME} access." + ) + + if not skip_profiling and not skip_summary: + print("") + + missing_perf = True + # Components in perf.yaml don't yet have the same name as their code generated names. 
def onnx_elem_type_to_str(elem_type: int) -> str:
    """Convert an ONNX tensor element type code to its dtype name.

    Parameters:
        elem_type: Integer value of ONNX ``TensorProto.DataType``.

    Returns:
        The dtype name, e.g. "float32".

    Raises:
        ValueError: if the element type is not one of the supported codes.
    """
    # ONNX TensorProto.DataType: 1=FLOAT, 2=UINT8, 3=INT8, 6=INT32, 10=FLOAT16.
    # Code 6 used to be reported as "int8", which mislabelled int32 tensors.
    names = {
        1: "float32",
        2: "uint8",
        3: "int8",
        6: "int32",
        10: "float16",
    }
    if elem_type in names:
        return names[elem_type]
    raise ValueError("Unsupported elem_type.")
Here is the code to produce + the default tensor: + + ``` + import fiftyone.zoo as foz + from PIL import Image + import torch + from qai_hub_models.models._shared.imagenet_classifier.app import preprocess_image + + image_dataset = foz.load_models_dataset( + "open-images-v6", + split="validation", + max_samples=50, + shuffle=True, + ) + + tensors = [] + for sample in image_dataset: + img = Image.open(sample.filepath) + tensors.append(preprocess_image(img)) + + final_tensor = torch.cat(tensors, dim=0) + + torch.save(final_tensor, "imagenet_quantization_samples.pt") + ``` + """ + return load_torch(quantization_samples_path or IMAGE_QUANTIZATION_SAMPLES_URL) diff --git a/qai_hub_models/utils/quantization_aimet.py b/qai_hub_models/utils/quantization_aimet.py new file mode 100644 index 00000000..936a4495 --- /dev/null +++ b/qai_hub_models/utils/quantization_aimet.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +try: + from aimet_torch import onnx_utils + from aimet_torch.qc_quantize_op import QcQuantizeWrapper + from aimet_torch.quantsim import QuantizationSimModel +except (ImportError, ModuleNotFoundError): + raise NotImplementedError( + "AIMET must be installed to load quantized models. " + "Install AIMET via the instructions here: " + "https://quic.github.io/aimet-pages/releases/latest/install/index.html" + ) + +import os +import shutil +import tempfile +from pathlib import Path +from typing import Any +from zipfile import ZipFile + +import torch +from qai_hub.client import DatasetEntries + +from qai_hub_models.evaluators.base_evaluators import ( + BaseEvaluator, + _DataLoader, + _for_each_batch, +) +from qai_hub_models.utils.base_model import ( + BaseModel, + InputSpec, + SourceModelFormat, + TargetRuntime, +) +from qai_hub_models.utils.input_spec import make_torch_inputs + + +class AIMETQuantizableMixin: + """ + This mixin provides quantization support with Qualcomm's AIMET package. 
+ """ + + def __init__( + self, + sim_model: QuantizationSimModel, + needs_onnx_direct_aimet_export: bool = False, + ): + self.quant_sim = sim_model + self.needs_onnx_direct_aimet_export = needs_onnx_direct_aimet_export + + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + if target_runtime == TargetRuntime.QNN: + return SourceModelFormat.ONNX + else: + return SourceModelFormat.TORCHSCRIPT + + def quantize( + self, + data: _DataLoader, + num_samples: int | None = None, + evaluator: BaseEvaluator | None = None, + device: str = "cpu", + requantize_model_weights=False, + ) -> float | None: + """ + Re-compute quantization encodings for this model with the given dataset and model evaluator. + + This model will be updated with a new set of quantization parameters. Future calls to + forward() and export_...() will take these quantization parameters into account. + + Parameters: + data: torch DataLoader | Collection + Data loader for the dataset to use for evaluation. + If an evaluator is __NOT__ provided (see "evaluator" parameter), the iterator must return + inputs: Collection[torch.Tensor] | torch.Tensor + + otherwise, if an evaluator __IS__ provided, the iterator must return + tuple( + inputs: Collection[torch.Tensor] | torch.Tensor, + ground_truth: Collection[torch.Tensor] | torch.Tensor] + ) + + num_samples: int | None + Number of samples to use for evaluation. One sample is one iteration from iter(data). + If none, defaults to the number of samples in the dataset. + + evaluator: BaseModelEvaluator | None + Evaluator to populate while quantizing the data. + If not provided, an evaluator is not used. + + device: str + Name of device on which inference should be run. + + requantize_model_weights: bool + If a weight is quantized, recompute its quantization parameters. + + Returns: + If an evaluator is provided, returns its accuracy score. No return value otherwise. 
+ """ + assert isinstance(self, BaseModel) + if not evaluator: + evaluator = self.get_evaluator() + + # Enable or disable quantization for model parameters (model weights). + # Activations are always re-quantized. + for quant_module in self.quant_sim.model.modules(): + if isinstance(quant_module, QcQuantizeWrapper): + for param_quantizer in quant_module.param_quantizers.values(): + if not requantize_model_weights: + try: + param_quantizer.freeze_encoding() + except RuntimeError: + # Encoding is not set, so it can't be frozen. + pass + else: + # Un-freeze the quantizer. + param_quantizer._is_encoding_frozen = False + + # Reset evaluator if applicable + if evaluator: + evaluator.reset() + + # Define evaluator function for this model. + def evaluator_func(model: torch.nn.Module, args): + # This function is defined because AIMET does not unwrap + # the arguments you pass to `compute_encodings`. + return ( + evaluator.add_from_dataset(model, *args) + if evaluator + else _for_each_batch(model, *args) + ) + + # Compute the new encodings. + self.quant_sim.compute_encodings(evaluator_func, [data, num_samples, device]) + + # Return accuracy score if applicable + return evaluator.get_accuracy_score() if evaluator else None + + def convert_to_torchscript_and_aimet_encodings( + self, + output_dir: str | Path, + input_spec: InputSpec | None = None, + model_name: str | None = None, + ) -> str: + """ + Converts the torch module to a zip file containing an + unquantized torchscript trace and an aimet quantization encodings file. 
+ """ + if model_name is None: + model_name = self.__class__.__name__ + if not input_spec: + input_spec = self._get_input_spec_ts() + + os.makedirs(output_dir, exist_ok=True) + zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") + base_dir = Path(f"{model_name}.aimet") + base_path = Path(output_dir) / base_dir + if base_path.exists(): + shutil.rmtree(base_path) + os.makedirs(base_path) + self.quant_sim.export( + str(base_path), + model_name, + tuple(make_torch_inputs(input_spec)), + export_to_torchscript=True, + ) + + # AIMET exports GraphModule. Convert it to ScriptModule + fx_graph_path = base_path / f"{model_name}.pth" + fx_graph = torch.load(fx_graph_path) + script_module = torch.jit.trace(fx_graph, tuple(make_torch_inputs(input_spec))) + torch.jit.save(script_module, base_path / f"{model_name}.pt") + + with ZipFile(zip_path, "w") as zip_object: + zip_object.write(base_path, base_dir) + zip_object.write( + base_path / f"{model_name}.pt", base_dir / f"{model_name}.pt" + ) + zip_object.write( + base_path / f"{model_name}_torch.encodings", + base_dir / f"{model_name}_torch.encodings", + ) + + return zip_path + + def convert_to_onnx_and_aimet_encodings( + self, + output_dir: str | Path, + input_spec: InputSpec | None = None, + model_name: str | None = None, + ) -> str: + """ + Converts the torch module to a zip file containing an + unquantized ONNX model and an aimet quantization encodings file. 
+ """ + if model_name is None: + model_name = self.__class__.__name__ + if not input_spec: + input_spec = self._get_input_spec_ts() + + os.makedirs(output_dir, exist_ok=True) + zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") + base_dir = Path(f"{model_name}.aimet") + base_path = Path(output_dir) / base_dir + if base_path.exists(): + shutil.rmtree(base_path) + os.makedirs(base_path) + + onnx_utils.EXPORT_TO_ONNX_DIRECT = self.needs_onnx_direct_aimet_export + self.quant_sim.export( + str(base_path), + model_name, + tuple(make_torch_inputs(input_spec)), + onnx_export_args=dict(input_names=[name for name in input_spec]), + ) + + onnx_file_name = f"{model_name}.onnx" + encodings_file_name = f"{model_name}.encodings" + with ZipFile(zip_path, "w") as zip_object: + zip_object.write(base_path, base_dir) + zip_object.write( + base_path / onnx_file_name, os.path.join(base_dir, onnx_file_name) + ) + zip_object.write( + base_path / encodings_file_name, + os.path.join(base_dir, encodings_file_name), + ) + + return zip_path + + def convert_to_torchscript(*args, **kwargs): + """Block users from calling convert_to_torchscript() on quantized models, since python will call both parent classes.""" + raise NotImplementedError( + "This model is quantized. Use `model.convert_to_quantized_torchscript` instead!" + ) + + def convert_to_quantized_torchscript( + self, input_spec: InputSpec | None = None, check_trace: bool = True + ) -> Any: + """ + Converts the torch module to a quantized torchscript trace. 
def get_calibration_data(
    self,
    target_runtime: TargetRuntime,
    input_spec: InputSpec | None = None,
) -> DatasetEntries | None:
    """
    Build the calibration dataset for this model and input spec.

    Samples are whatever `make_torch_inputs(input_spec)` produces (documented
    upstream as randomized input in range [0, 1]), converted to numpy arrays
    and keyed by input name.

    Parameters:
        target_runtime: Accepted for interface compatibility; this
            implementation does not read it.
        input_spec: Spec describing the model inputs. A falsy value falls
            back to `self._get_input_spec_ts()`.

    Returns:
        Mapping of input name to its numpy sample.
    """
    spec = input_spec or self._get_input_spec_ts()
    tensors = make_torch_inputs(spec)
    entries = {}
    for name, tensor in zip(spec.keys(), tensors):
        entries[name] = tensor.numpy()
    return entries
def assert_most_same(arr1: np.ndarray, arr2: np.ndarray, diff_tol: float) -> None:
    """
    Checks whether most values in the two numpy arrays are the same.

    Particularly for image models, slight differences in the PIL/cv2 envs
    may cause image <-> tensor conversion to be slightly different.

    Instead of using np.assert_allclose, this may be a better way to test image outputs.

    Parameters:
        arr1: First input image array.
        arr2: Second input image array.
        diff_tol: Float in range [0,1] representing percentage of values
            that can be different while still having the assertion pass.

    Raises:
        AssertionError if input arrays are different size,
            or too many values are different.
    """
    # Compare shapes explicitly: `!=` alone would broadcast compatible shapes
    # (e.g. (3,) against (1,)) and let a size mismatch pass unnoticed.
    shape1, shape2 = np.shape(arr1), np.shape(arr2)
    assert (
        shape1 == shape2
    ), f"Input arrays have different shapes: {shape1} vs {shape2}."

    different_values = arr1 != arr2
    assert (
        np.mean(different_values) <= diff_tol
    ), f"More than {diff_tol * 100}% of values were different."
def get_coverage_reports():
    """Return the paths matching `.coverage.*` directly inside COVERAGE_DIR."""
    pattern = os.path.join(COVERAGE_DIR, ".coverage.*")
    return glob.glob(pattern)
type=str, + nargs="*", + help='Task(s) to run. Specify "list" to show all tasks.', + ) + + parser.add_argument( + "--only", + action="store_true", + help="Run only the listed task(s), skipping any dependencies.", + ) + + parser.add_argument( + "--print-task-graph", + action="store_true", + help="Print the task library in DOT format and exit. Combine with --task to highlight what would run.", + ) + + parser.add_argument( + "--python", + type=str, + default="python3.8", + help="Python executable path or name (only used when creating the venv).", + ) + + parser.add_argument( + "--venv", + type=str, + metavar="...", + default=VENV_PATH, + help=textwrap.dedent( + """\ + [optional] Use the virtual env at the specified path. + - Creates a virtual env at that path if none exists. + - If omitted, creates and uses a virtual environment at """ + + VENV_PATH + + """ + - If [none], does not create or activate a virtual environment. + """ + ), + ) + + parser.add_argument( + "--dry-run", action="store_true", help="Print the plan, rather than running it." + ) + + parser.add_argument( + "--defer-coverage-report", + action="store_true", + help=textwrap.dedent( + """\ + Skip coverage report and keep coverage files. These files will + be included in subsequent runs to build_and_test.py that do not + defer the report. This helps produce a single report from a + series of separate build_and_test.py commands. 
@staticmethod
def to_dot(highlight: Optional[List[str]] = None) -> str:
    """
    Render the task library as a Graphviz DOT digraph.

    Every task in ALL_TASKS becomes a node: tasks that are also in
    PUBLIC_TASKS are drawn filled, and tasks named in `highlight` get a
    thicker outline. Every entry of TASK_DEPENDENCIES adds one
    `task -> dependency` edge.

    Parameters:
        highlight: Names of the tasks to emphasize. None (the default)
            highlights nothing.

    Returns:
        The DOT source as a single string.
    """
    # A None sentinel replaces the former mutable `[]` default, which is
    # created once and shared between all calls.
    highlighted = highlight if highlight is not None else ()

    elements: List[str] = []
    for tsk in ALL_TASKS:
        task_attrs: List[str] = []
        if tsk in PUBLIC_TASKS:
            task_attrs.append("style=filled")
        if tsk in highlighted:
            task_attrs.append("penwidth=4.0")
        # Nodes without attributes are emitted as a bare identifier.
        elements.append(f"{tsk} [{' '.join(task_attrs)}]" if task_attrs else tsk)
    for tsk, deps in TASK_DEPENDENCIES.items():
        elements.extend(f"{tsk} -> {dep}" for dep in deps)
    elements_str = "\n".join(f" {element};" for element in elements)
    return f"digraph {{\n{elements_str}\n}}"
@task
def clean_pip(self, plan: Plan) -> str:
    """Add a `clean_pip` step that deletes the configured virtual environment directory."""

    class CleanPipTask(Task):
        """Removes the directory at `venv_path` with `rm`; does nothing when it is None."""

        def __init__(self, venv_path: Optional[str]) -> None:
            super().__init__("Deleting python packages")
            self.venv_path = venv_path

        def does_work(self) -> bool:
            return True

        def run_task(self) -> None:
            target = self.venv_path
            if target is None:
                # No virtual environment configured: nothing to delete.
                return
            # Sanity check so we don't accidentally "rm -rf /": paths outside
            # $HOME are removed with -I (interactive) instead of -f.
            # NOTE(review): the path is interpolated unquoted and HOME must be
            # set in the environment -- confirm both are acceptable for `run`.
            inside_home = target.startswith(os.environ["HOME"])
            flags = "-rf" if inside_home else "-rI"
            run(f"rm {flags} {target}")

    cleanup = CleanPipTask(self.venv_path)
    return plan.add_step("clean_pip", cleanup)
@public_task(
    "Run most tests for only added/modified models in Model Zoo. Includes most tests, uses shared global cache, and uses the same environment for each model."
)
@depends(["install_deps"])
def test_changed_models(
    self, plan: Plan, step_id: str = "test_changed_models"
) -> str:
    """
    Add the step that tests only the models touched by the current change.

    Export tests are limited to the models whose definition and export files
    both changed. When only export files changed (a mass change to the export
    scripts), a single sample model is export-tested to keep CI fast.
    """
    # Models whose model.py changed.
    changed_defs = set(get_models_with_changed_definitions())
    # Models whose export.py or test_generated.py changed.
    changed_exports = set(get_models_with_export_file_changes())

    changed_both = changed_defs & changed_exports
    if changed_both:
        # Only export-test the models whose definitions changed as well.
        export_models = changed_both
    elif changed_exports:
        # Export scripts were mass-changed without any model definition
        # changing: one sample model is enough to exercise the export path.
        export_models = {next(iter(changed_exports))}
    else:
        export_models = set()

    # demo.py or model.py changed. Export tests can only run alongside the
    # general model tests, so the export models are always included.
    general_models = set(get_models_to_run_general_tests()) | export_models

    models_task = PyTestModelsTask(
        self.python_executable,
        general_models,
        export_models,
        self.venv_path,
        venv_for_each_model=False,
        use_shared_cache=True,
    )
    return plan.add_step(step_id, models_task)
@public_task(
    "Run all tests for all models in Model Zoo. Includes export tests, and creates a fresh environment for each model."
)
@depends(["install_deps"])
def test_all_models_long(
    self, plan: Plan, step_id: str = "test_all_models_long"
) -> str:
    """
    Add the step that runs the general and export tests for every model.

    The task description previously duplicated the one on `test_all_models`,
    making the two public tasks indistinguishable by description; it now
    mirrors the wording used for `test_changed_models_long`.

    Parameters:
        plan: Plan that receives the step.
        step_id: Identifier under which the step is registered.

    Returns:
        Whatever `plan.add_step` returns for the new step.
    """
    # Includes export tests, and creates a fresh environment for each model.
    all_models = get_all_models()
    return plan.add_step(
        step_id,
        PyTestModelsTask(
            self.python_executable,
            all_models,  # models that get the general tests
            all_models,  # models that also get the export tests
            self.venv_path,
            venv_for_each_model=True,
            use_shared_cache=False,
        ),
    )
report", + f'coverage html -d "{TEST_RESULTS_DIR}/html"', + ], + ).run() + coverage = run_with_venv_and_get_output( + self.venv_path, + "coverage report | tail -1 | sed 's/[[:blank:]]*$//;s/.*[[:blank:]]//'", + ) + set_github_output("coverage", coverage) + + class ReportCoverageTask(ConditionalTask): + def __init__(self, venv_path: Optional[str]) -> None: + super().__init__( + group_name=None, + condition=lambda: len(get_coverage_reports()) == 0 + or defer_coverage_report, + true_task=NoOpTask(), + false_task=RunCoverageTask(venv_path), + ) + + def does_work(self) -> bool: + return True + + return plan.add_step("test_report_coverage", ReportCoverageTask(self.venv_path)) + + @public_task("Release QAIHM (build repo & wheel, push repo & wheel)") + @depends(["install_deps"]) + def release(self, plan: Plan, step_id: str = "release") -> str: + return plan.add_step( + step_id, + ReleaseTask( + self.venv_path, + self.python_executable, + build_repository=True, + push_repository=True, + build_wheel=True, + publish_wheel=True, + ), + ) + + @public_task("Mock Release QAIHM (build repo & wheel, but do not push them)") + @depends(["install_deps"]) + def mock_release(self, plan: Plan, step_id: str = "mock_release") -> str: + return plan.add_step( + step_id, + ReleaseTask( + self.venv_path, + self.python_executable, + build_repository=True, + push_repository=False, + build_wheel=True, + publish_wheel=False, + ), + ) + + # This task has no dependencies and does nothing. It will still trigger + # summarizer, so it can be used to finalize a coverage report.
def plan_from_dependencies(
    main_tasks: List[str],
    python_executable: str,
    venv_path: Optional[str],
    defer_coverage_report: bool = False,
) -> Plan:
    """
    Build a plan containing `main_tasks`, their transitive dependencies, and all summarizers.

    Dependencies come from TASK_DEPENDENCIES. A task is added to the plan only
    once every one of its dependencies is already a step of the plan.

    Parameters:
        main_tasks: Task names requested by the user, in the order given.
        python_executable: Forwarded to TaskLibrary.
        venv_path: Forwarded to TaskLibrary; None means no virtual environment.
        defer_coverage_report: Forwarded to TaskLibrary.

    Returns:
        The populated plan.

    Exits the process with status 1 when a requested task or a declared
    dependency does not exist on TaskLibrary.
    """
    task_library = TaskLibrary(
        python_executable,
        venv_path,
        defer_coverage_report=defer_coverage_report,
    )
    plan = Plan()

    # We always run summarizers, which perform conditional work on the output
    # of other steps. Work on a copy: the stack below is extended and popped
    # until empty, and aliasing the imported SUMMARIZERS list would empty it
    # for every later use in this process.
    work_list = list(SUMMARIZERS)

    # The work list is processed as a stack, so LIFO. We reverse the user-specified
    # tasks so that they (and their dependencies) can be expressed in a natural order.
    work_list.extend(reversed(main_tasks))

    for task_name in work_list:
        if not hasattr(task_library, task_name):
            echo(f"Task '{task_name}' does not exist.", file=sys.stderr)
            sys.exit(1)

    while work_list:
        task_name = work_list.pop()
        unfulfilled_deps: List[str] = []
        for dep in TASK_DEPENDENCIES.get(task_name, []):
            if not plan.has_step(dep):
                unfulfilled_deps.append(dep)
                if not hasattr(task_library, dep):
                    echo(
                        f"Non-existent task '{dep}' was declared as a dependency for '{task_name}'.",
                        file=sys.stderr,
                    )
                    sys.exit(1)
        if not unfulfilled_deps:
            # add task_name to plan
            task_adder: Callable[[Plan], str] = getattr(task_library, task_name)
            task_adder(plan)
        else:
            # Look at task_name again later when its deps are satisfied
            work_list.append(task_name)
            work_list.extend(reversed(unfulfilled_deps))

    return plan
def pytest_collection_modifyitems(config, items):
    """Mark every collected `on_device` test as skipped unless `--on-device` was given."""
    run_on_device = config.getoption("--on-device")
    if not run_on_device:
        skip_marker = pytest.mark.skip(reason="need --on-device option to run")
        on_device_items = (
            candidate for candidate in items if "on_device" in candidate.keywords
        )
        for test_item in on_device_items:
            test_item.add_marker(skip_marker)
    # With --on-device on the command line nothing is skipped.
a/scripts/examples/quantize_deeplabv3.py b/scripts/examples/quantize_deeplabv3.py new file mode 100644 index 00000000..57cc33b8 --- /dev/null +++ b/scripts/examples/quantize_deeplabv3.py @@ -0,0 +1,50 @@ +""" +This is a sample script showing how to take a AIMET model zoo model without +pre-computed activations, and compute those activations using QAIHM. + +This script assumes the model is added to QAIHM, but is missing quantization parameters. +""" +import argparse +import os + +from aimet_zoo_torch.deeplabv3.dataloader import get_dataloaders_and_eval_func + +from qai_hub_models.models.deeplabv3_plus_mobilenet_quantized.model import ( + MODEL_ID, + DeepLabV3PlusMobileNetQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +if __name__ == "__main__": + # Args + parser = argparse.ArgumentParser() + parser.add_argument( + "--voc-path", + required=True, + help="Local path to VOCdevkit/VOC2012. VOC Devkit can be found here http://host.robots.ox.ac.uk/pascal/VOC/voc2012/#devkit", + ) + parser.add_argument( + "--num-iter", type=int, default=None, help="Number of dataset iterations to use" + ) + args = parser.parse_args() + + # Load model. + train_loader, _, _ = get_dataloaders_and_eval_func(args.voc_path) + + # You can skip loading parameters in from_pretrained() if you haven't generated them yet. 
if __name__ == "__main__":
    # Command-line options.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-iter", type=int, default=1, help="Number of batches to use."
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=8,
        help="Batch size to use on each iteration.",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Directory where encodings should be stored. Defaults to ./build.",
    )
    parser.add_argument(
        "--output-name",
        type=str,
        default=None,
        # The help text used to read "Defaults to _encodings."; the actual
        # default below is the model's MODEL_ID followed by "_encodings".
        help="Encodings filename. Defaults to <MODEL_ID>_encodings.",
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Name of the model folder to compute encodings.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Manual seed to ensure reproducibility for quantization.",
    )
    args = parser.parse_args()

    # The model package is resolved by folder name at runtime.
    module = importlib.import_module(f"qai_hub_models.models.{args.model}")

    dataset = ImagenetteDataset()
    # Seed before building the shuffling loader so the batch order is reproducible.
    torch.manual_seed(args.seed)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # Load the model without pre-computed AIMET encodings.
    model = module.Model.from_pretrained(aimet_encodings=None)

    accuracy = model.quantize(dataloader, args.num_iter, model.get_evaluator())
    print(f"Accuracy: {accuracy * 100:.3g}%")

    # Fall back to ./build and <MODEL_ID>_encodings when not given on the CLI.
    output_path = args.output_dir or str(Path() / "build")
    output_name = args.output_name or f"{module.MODEL_ID}_encodings"
    model.quant_sim.save_encodings_to_json(output_path, output_name)
+ ) + parser.add_argument( + "--batch-size", + type=int, + default=128, + help="Batch size to use on each iteration.", + ) + parser.add_argument( + "--model", + type=str, + default="sesr_m5_quantized", + help="Name of the model folder to compute encodings. This script expects a super resolution model with a scaling parameter, eg SESR M5 Quantized.", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Manual seed to ensure reproducibility for quantization.", + ) + args = parser.parse_args() + module = importlib.import_module(f"qai_hub_models.models.{args.model}") + + # Load dataset + dataset = BSD300Dataset(scaling_factor=module.model.SCALING_FACTOR) + torch.manual_seed(args.seed) + # Pass it to the dataloader + dataloader = DataLoader( + dataset, batch_size=args.batch_size, shuffle=True, drop_last=False + ) + + # Load model and confirm it's a quantizable type. + model = module.Model.from_pretrained(aimet_encodings=None) + assert isinstance(model, AIMETQuantizableMixin) + + # Quantize activations + accuracy = model.quantize(dataloader, args.num_iter, model.get_evaluator()) + print(f"PSNR: {accuracy}") + + # Export encodings + model.quant_sim.save_encodings_to_json(Path() / "build", module.MODEL_ID) diff --git a/scripts/examples/test_numerics_mobilenet_v2_quantized.py b/scripts/examples/test_numerics_mobilenet_v2_quantized.py new file mode 100644 index 00000000..94c2d7c2 --- /dev/null +++ b/scripts/examples/test_numerics_mobilenet_v2_quantized.py @@ -0,0 +1,173 @@ +""" +Run it with pytest --on-device +""" +from typing import Tuple + +import numpy as np +import pytest +import qai_hub as hub +import torch +from torch.utils.data import DataLoader, random_split +from tqdm import tqdm + +from qai_hub_models.datasets.imagenette import ImagenetteDataset +from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable +from qai_hub_models.utils.inference import compile_zoo_model_to_hub +from qai_hub_models.utils.measurement import 
@pytest.fixture(scope="module")
def test_data(data_loaders) -> Tuple[torch.Tensor, torch.Tensor, hub.Dataset]:
    """
    Collect the first `num_test` test samples and upload the images to Hub.

    Returns:
        (images, labels, hub dataset), where images and labels are truncated
        to `num_test` entries and the hub dataset holds the images split into
        one array per sample under the model's first input name.
    """
    _, test_loader = data_loaders
    num_test = 1000

    img_batches, label_batches = [], []
    total_samples = 0
    for images, labels in tqdm(test_loader):
        img_batches.append(images)
        label_batches.append(labels)
        total_samples += images.size(0)
        # Stop at the same limit used for the slices below; this was a
        # separately hard-coded 1000 that could drift from num_test.
        if total_samples >= num_test:
            break
    img_test = torch.cat(img_batches, dim=0)[:num_test]
    label_test = torch.cat(label_batches, dim=0)[:num_test]
    input_name = list(
        MobileNetV2Quantizable.from_pretrained(aimet_encodings=None)
        .get_input_spec()
        .keys()
    )[0]
    # One array per sample, split along the first axis.
    data_entries = {input_name: np.split(img_test.numpy(), img_test.shape[0])}
    hub_ds = hub.upload_dataset(data_entries)
    return img_test, label_test, hub_ds
@pytest.fixture(scope="module")
def quantized_model(data_loaders, test_data):
    """
    Calibrate a MobileNetV2Quantizable model and return it.

    The encodings are computed from three calibration batches; the accuracy
    returned by `quantize` and the QuantSim accuracy on the test samples are
    both checked against fixed reference values before the model is handed
    to the tests.
    """
    img_test, label_test, _ = test_data
    calib_loader, _ = data_loaders
    model = MobileNetV2Quantizable.from_pretrained(aimet_encodings=None)

    # Calibration in quantization
    num_calib_batches = 3
    calib_accuracy = model.quantize(
        calib_loader, num_calib_batches, evaluator=model.get_evaluator()
    )
    np.testing.assert_allclose(0.76, calib_accuracy, atol=0.01)

    # QuantSim evaluation on eval set, with a fresh evaluator
    evaluator = model.get_evaluator()

    batch_size = 32
    batch_starts = list(range(0, img_test.size(0), batch_size))
    for start in tqdm(batch_starts, desc="QuantSim eval"):
        stop = start + batch_size
        sim_out = model(img_test[start:stop]).detach()
        evaluator.add_batch(sim_out, label_test[start:stop])

    sim_acc = evaluator.get_accuracy_score()
    print(f"{sim_acc=}")
    np.testing.assert_allclose(0.78125, sim_acc, atol=0.01)
    return model
+ """ + model = quantized_model + + img_test, label_test, hub_dataset = test_data + calib_loader, test_loader = data_loaders + + # calibration data + calibration_data = None + if hub_needs_calib_data: + # AIMET export has missing encoding and needs calibration data + num_calib_batches = 3 + calib_imgs = [] + for b, (img_calib, labels) in enumerate(iter(calib_loader)): + if b >= num_calib_batches: + break + img_np = img_calib.numpy() + calib_imgs.extend(np.split(img_np, img_np.shape[0])) + calibration_data = {list(model.get_input_spec().keys())[0]: calib_imgs} + + # On-device inference + device = hub.Device("Samsung Galaxy S23") + hub_model = compile_zoo_model_to_hub( + model=model, + device=device, + target_runtime=target_runtime, + calibration_data=calibration_data, + ) + + # Make sure model is quantized + tgt_model_size_mb = get_model_size_mb(hub_model.model) + np.testing.assert_allclose(expected_size_mb, tgt_model_size_mb, rtol=0.1) + + # Check on-device accuracy + hub_out = hub_model(hub_dataset) + evaluator = model.get_evaluator() + evaluator.add_batch(hub_out, label_test) + hub_acc = evaluator.get_accuracy_score() + print(f"{target_runtime=}, {hub_acc=}") + np.testing.assert_allclose(expected_acc, hub_acc, atol=0.01) diff --git a/scripts/examples/yolov6_evaluation.py b/scripts/examples/yolov6_evaluation.py new file mode 100644 index 00000000..42926e14 --- /dev/null +++ b/scripts/examples/yolov6_evaluation.py @@ -0,0 +1,35 @@ +""" +This is a sample script showing how to take a AIMET model zoo model without +pre-computed activations, and compute those activations using QAIHM. +This script assumes the model is added to QAIHM, but is missing quantization parameters. 
+Packages to install: pycocotools, object-detection-metrics==0.4.post1, shapely +""" + +from torch.utils.data import DataLoader + +from qai_hub_models.datasets.coco import CocoDataset, collate_fn +from qai_hub_models.evaluators.detection_evaluator import DetectionEvaluator +from qai_hub_models.models.yolov6.model import YoloV6 + +if __name__ == "__main__": + # Load dataset. + dataset = CocoDataset() + # Pass it to data loader + dataloader = DataLoader( + dataset, batch_size=1, shuffle=True, collate_fn=collate_fn, drop_last=False + ) + + # Load model + model = YoloV6.from_pretrained() + + # Instantiate the evaluator + evaluator = DetectionEvaluator( + image_height=640, + image_width=640, + nms_score_threshold=0.3, + nms_iou_threshold=0.5, + ) + + # Pass batches of data through the model. + evaluator.add_from_dataset(model, dataloader, eval_iterations=1000) + print(f"mAP: {evaluator.mAP:.1%}") diff --git a/scripts/quantize_ffnet.py b/scripts/quantize_ffnet.py new file mode 100644 index 00000000..cb0b0a95 --- /dev/null +++ b/scripts/quantize_ffnet.py @@ -0,0 +1,77 @@ +import argparse +from pathlib import Path + +import torch + +from qai_hub_models.models._shared.cityscapes_segmentation.app import ( + _load_cityscapes_loader, +) +from qai_hub_models.models.ffnet_40s_quantized.model import FFNet40SQuantizable +from qai_hub_models.models.ffnet_54s_quantized.model import FFNet54SQuantizable +from qai_hub_models.models.ffnet_78s_quantized.model import FFNet78SQuantizable + +FFNET_VARIANTS = { + "ffnet_40s": FFNet40SQuantizable, + "ffnet_54s": FFNet54SQuantizable, + "ffnet_78s": FFNet78SQuantizable, +} + + +""" +This is a sample script showing how to take a AIMET model zoo model without +pre-computed activations, and compute those activations using QAISM. + +This script assumes the model is added to QAISM, but is missing quantization parameters. 
+""" +if __name__ == "__main__": + # Args + parser = argparse.ArgumentParser() + parser.add_argument( + "--variant", + choices=FFNET_VARIANTS.keys(), + required=True, + help="FFNet variant", + ) + parser.add_argument( + "--cityscapes-path", + required=True, + help="Local path to Cityscapes (where leftImg8bit_trainvaltest.zip and gtFine_trainvaltest.zip are unzipped). Download from https://www.cityscapes-dataset.com/downloads/", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory where encodings should be stored. Defaults to ./build.", + ) + parser.add_argument( + "--output-name", + type=str, + default=None, + help="Encodings filename. Defaults to _encodings.", + ) + parser.add_argument( + "--num-iter", type=int, default=None, help="number of dataset iterations to use" + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Name of the model folder to compute encodings.", + ) + args = parser.parse_args() + + torch.manual_seed(args.seed) + + # Load data loader + loader = _load_cityscapes_loader(args.cityscapes_path) + + # Load model (with trained unquantized weights and without encodings) + FFNetQuantizable_cls = FFNET_VARIANTS[args.variant] + model = FFNetQuantizable_cls.from_pretrained(aimet_encodings=None) + + # Quantize weights and activations + model.quantize(loader, num_samples=args.num_iter, requantize_model_weights=True) + + output_path = args.output_dir or str(Path() / "build") + output_name = args.output_name or f"{args.variant}_quantized_encodings" + model.quant_sim.save_encodings_to_json(output_path, output_name) diff --git a/scripts/tasks/changes.py b/scripts/tasks/changes.py new file mode 100644 index 00000000..cf597cf7 --- /dev/null +++ b/scripts/tasks/changes.py @@ -0,0 +1,218 @@ +import os +from typing import Iterable, Set + +from .constants import ( + PY_PACKAGE_MODELS_ROOT, + PY_PACKAGE_RELATIVE_MODELS_ROOT, + PY_PACKAGE_RELATIVE_SRC_ROOT, + REPO_ROOT, +) +from .github import on_github +from 
.util import new_cd, run, run_and_get_output + + +def get_python_import_expression(filepath: str) -> str: + """ + Given a filepath, return the expression used to import the file + in other modules. + + For example, qiasm_model_zoo/models/trocr/model.py -> + qiasm_model_zoo.models.trocr.model + """ + + rel_path = os.path.relpath(filepath, PY_PACKAGE_RELATIVE_SRC_ROOT) + init_suffix = "/__init__.py" + if rel_path.endswith(init_suffix): + rel_path = rel_path[: -len(init_suffix)] + else: + rel_path = rel_path[: -len(".py")] + return rel_path.replace("/", ".") + + +def resolve_affected_models( + changed_files, + include_model: bool = True, + include_demo: bool = True, + include_export: bool = True, + include_tests: bool = True, + include_generated_tests: bool = True, +) -> Iterable[str]: + """ + Given a list of changed python files, performs a Depth-First Search (DFS) + over the qai_hub_models directory to figure out which directories were affected. + + The source nodes are the files that were directly changed, and there's + an edge from file A to file B if file B imports from file A. + + Note: If a zoo module is imported using a relative path, the dependency will not + be detected. Imports should be done using "from qai_stac_models." + in order to detect that current file depends on . + + changed_files: List of filepaths to files that changed. Paths are + relative to the root of this repository. + """ + seen: Set[str] = set() + while len(changed_files) > 0: + # Pop off stack + curr_file = changed_files.pop() + seen.add(curr_file) + + file_import = get_python_import_expression(curr_file) + grep_out = run_and_get_output( + f"grep -r --include='*.py' '{file_import}' {PY_PACKAGE_RELATIVE_SRC_ROOT}", + check=False, + ) + if grep_out.strip() == "": + continue + + # Determine which files depend on the current file, and thus + # also may be affected by the current change + # i.e. 
resolve the edges of the current node for DFS + dependent_files = set() + for grep_result in grep_out.strip().split("\n"): + dependent_file = grep_result.split(":")[0] + dependent_files.add(dependent_file) + + # Add new nodes to stack + for dependent_file in dependent_files: + if dependent_file not in seen: + changed_files.append(dependent_file) + + changed_models = set() + for f in seen: + if f.startswith(PY_PACKAGE_RELATIVE_MODELS_ROOT): + if not include_model and os.path.basename(f) == "model.py": + continue + if not include_export and os.path.basename(f) == "export.py": + continue + if not include_tests and os.path.basename(f) == "test.py": + continue + if ( + not include_generated_tests + and os.path.basename(f) == "test_generated.py" + ): + continue + if not include_demo and os.path.basename(f) == "demo.py": + continue + + model_name = f[len(PY_PACKAGE_RELATIVE_MODELS_ROOT) :].split("/")[1] + if os.path.exists( + os.path.join(PY_PACKAGE_MODELS_ROOT, model_name, "model.py") + ): + changed_models.add(model_name) + return changed_models + + +def get_changed_files_in_package() -> Iterable[str]: + """ + Returns the list of changed files in zoo based on git tracking. + """ + with new_cd(REPO_ROOT): + os.makedirs("build/model-zoo/", exist_ok=True) + changed_files_path = "build/changed-qaihm-files.txt" + if not on_github(): + run( + f"git diff $(git merge-base --fork-point origin/main) --name-only > {changed_files_path}" + ) + if os.path.exists(changed_files_path): + with open(changed_files_path, "r") as f: + return [ + file + for file in f.read().split("\n") + if file.startswith(PY_PACKAGE_RELATIVE_SRC_ROOT) + and file.endswith(".py") + ] + return [] + + +def get_models_to_test_export() -> Iterable[str]: + """ + The models for which to test export (i.e. compilation to .tflite). + Current heuristic is to only do this for models where model.py or + export.py changed. 
+ """ + return get_changed_models( + include_model=True, + include_demo=False, + include_export=True, + include_tests=False, + include_generated_tests=True, + ) + + +def get_models_with_export_file_changes() -> Iterable[str]: + """ + The models for which to test export (i.e. compilation to .tflite). + Current heuristic is to only do this for models where model.py or + export.py changed. + """ + return get_changed_models( + include_model=False, + include_demo=False, + include_export=True, + include_tests=False, + include_generated_tests=True, + ) + + +def get_models_with_changed_definitions() -> Iterable[str]: + """ + The models for which to run non-generated (demo / model) tests. + """ + return get_changed_models( + include_model=True, + include_demo=False, + include_export=False, + include_tests=False, + include_generated_tests=False, + ) + + +def get_models_to_run_general_tests() -> Iterable[str]: + """ + The models for which to run non-generated (demo / model) tests. + """ + return get_changed_models( + include_model=True, + include_demo=True, + include_export=False, + include_tests=True, + include_generated_tests=False, + ) + + +def get_changed_models( + include_model: bool = True, + include_demo: bool = True, + include_export: bool = True, + include_tests: bool = True, + include_generated_tests: bool = True, +) -> Iterable[str]: + """ + Resolve which models within zoo have changed to figure which ones need to be tested. + + First figures out which files have changed and then does a recursive search + through all files that import from changed files. Then filters the final list + to model directories to know which ones that need to be tested. + + Returns a list of model IDs (folder names) that have changed. 
+ """ + return resolve_affected_models( + get_changed_files_in_package(), + include_model, + include_demo, + include_export, + include_tests, + include_generated_tests, + ) + + +def get_all_models() -> Iterable[str]: + """ + Resolve model IDs (folder names) of all models in QAIHM. + """ + model_names = set() + for model_name in os.listdir(PY_PACKAGE_MODELS_ROOT): + if os.path.exists(os.path.join(PY_PACKAGE_MODELS_ROOT, model_name, "model.py")): + model_names.add(model_name) + return model_names diff --git a/scripts/tasks/constants.py b/scripts/tasks/constants.py new file mode 100644 index 00000000..6e28ba16 --- /dev/null +++ b/scripts/tasks/constants.py @@ -0,0 +1,30 @@ +import os + +from .util import run_and_get_output + +# Env Variable +STORE_ROOT_ENV_VAR = "QAIHM_STORE_ROOT" + +# Repository +REPO_ROOT = run_and_get_output("git rev-parse --show-toplevel") +VENV_PATH = os.path.join(REPO_ROOT, "qaihm-dev") +BUILD_ROOT = os.path.join(REPO_ROOT, "build") + +# Dependent Wheels +QAI_HUB_LATEST_PATH = os.path.join(BUILD_ROOT, "qai_hub-latest-py3-none-any.whl") + +# Package paths relative to repository root +PY_PACKAGE_RELATIVE_SRC_ROOT = "qai_hub_models" +PY_PACKAGE_RELATIVE_MODELS_ROOT = os.path.join(PY_PACKAGE_RELATIVE_SRC_ROOT, "models") + +# Absolute package paths +PY_PACKAGE_INSTALL_ROOT = REPO_ROOT +PY_PACKAGE_SRC_ROOT = os.path.join( + PY_PACKAGE_INSTALL_ROOT, PY_PACKAGE_RELATIVE_SRC_ROOT +) +PY_PACKAGE_LOCAL_CACHE = os.environ.get( + STORE_ROOT_ENV_VAR, os.path.join(os.path.expanduser("~"), ".qaihm") +) +PY_PACKAGE_MODELS_ROOT = os.path.join( + PY_PACKAGE_INSTALL_ROOT, PY_PACKAGE_RELATIVE_MODELS_ROOT +) diff --git a/scripts/tasks/github.py b/scripts/tasks/github.py new file mode 100644 index 00000000..046185e2 --- /dev/null +++ b/scripts/tasks/github.py @@ -0,0 +1,25 @@ +import os + +from .util import Colors, echo + + +def on_github(): + return "GITHUB_ACTION" in os.environ + + +def start_group(group_name): + if on_github(): + echo(f"::group::{group_name}") + 
else: + echo(f"{Colors.GREEN}{group_name}{Colors.OFF}") + + +def end_group(): + if on_github(): + echo("::endgroup::") + + +def set_github_output(key, value): + if on_github(): + with open(os.environ["GITHUB_OUTPUT"], "a") as fh: + print(f"{key}={value}", file=fh) diff --git a/scripts/tasks/plan.py b/scripts/tasks/plan.py new file mode 100644 index 00000000..6667fa7e --- /dev/null +++ b/scripts/tasks/plan.py @@ -0,0 +1,167 @@ +import datetime +import functools +import re +import time +from typing import Callable, Dict, List, Optional, Tuple + +from .task import Task +from .util import echo + +ALL_TASKS: List[str] = [] +PUBLIC_TASKS: List[str] = [] +TASK_DEPENDENCIES: Dict[str, List[str]] = {} +TASK_DESCRIPTIONS: Dict[str, str] = {} +SUMMARIZERS: List[str] = [] + + +def task(func): + ALL_TASKS.append(func.__name__) + return func + + +def public_task(description: str): + def add_task(func): + PUBLIC_TASKS.append(func.__name__) + TASK_DESCRIPTIONS[func.__name__] = description + task(func) + return func + + return add_task + + +def depends(deps: List[str]): + def add_dep(func): + TASK_DEPENDENCIES[func.__name__] = deps + return func + + return add_dep + + +def summarizer(func): + SUMMARIZERS.append(func.__name__) + return func + + +class Step: + """A named Task within a Plan.""" + + def __init__(self, step_id: str, task: Task): + self._step_id = step_id + self._task = task + + def __repr__(self) -> str: + return self._step_id + + @property + def step_id(self) -> str: + return self._step_id + + @property + def task(self) -> Task: + return self._task + + +class Plan: + """An ordered list of Tasks to execute.""" + + _steps: List[Step] + _skips: List[re.Pattern] + _plan_duration = Optional[datetime.timedelta] + _step_durations: List[Tuple[str, datetime.timedelta]] + + def __init__(self) -> None: + self._steps = [] + self._skips = [] + self._plan_duration = None + self._step_durations = [] + + def add_step(self, step_id: str, task: Task) -> str: + if 
self.count_step(step_id) > 10: + raise RuntimeError( + f"Refusing to add step '{step_id}' more than 10 times. Perhaps the planner is in an infinite loop?" + ) + self._steps.append(Step(step_id, task)) + return step_id + + def count_step(self, step_id: str) -> int: + step_count = 0 + for s in self._steps: + if s.step_id == step_id: + step_count += 1 + return step_count + + def for_each(self, func: Callable[[str, Task], None]) -> None: + for s in self._steps: + func(s.step_id, s.task) + + def has_step(self, step_id: str) -> bool: + for s in self._steps: + if s.step_id == step_id: + return True + return False + + def is_skipped(self, step_id: str) -> bool: + return any([r.match(step_id) for r in self._skips]) + + def print(self) -> None: + for step in self._steps: + step_msg = step.step_id + if not step.task.does_work(): + step_msg += " (no-op)" + if self.is_skipped(step.step_id): + step_msg += " (skipped)" + print(step_msg) + + def print_report(self) -> None: + """Print a report on how long steps in the plan took.""" + + if len(self._step_durations) < 1: + return + + step_id_lens = [len(s) for s, d in self._step_durations] + max_step_id_len = functools.reduce(lambda a, b: a if a > b else b, step_id_lens) # type: ignore + print(f"{'Step':^{max_step_id_len}} {'Duration':^14}") + print(f"{'-':-^{max_step_id_len}} {'-':-^14}") + for step_id, duration in self._step_durations: + print(f"{step_id:<{max_step_id_len}} {str(duration):<14}") + if self._plan_duration: + print(f"{'-':-^{max_step_id_len}} {'-':-^14}") + print(f"{'Total':<{max_step_id_len}} {str(self._plan_duration):<14}") + + def run(self) -> None: + start_time = time.monotonic() + + def run_task(step_id: str, task: Task) -> None: + if self.is_skipped(step_id): + echo(f"Skipping {step_id}") + else: + step_start_time = time.monotonic() + + caught: Optional[Exception] = None + try: + task.run() + except Exception as ex: + caught = ex + step_end_time = time.monotonic() + if task.does_work(): + 
self._step_durations.append( + ( + step_id, + datetime.timedelta(seconds=step_end_time - step_start_time), + ) + ) + if caught is not None: + raise caught + + try: + self.for_each(run_task) + finally: + end_time = time.monotonic() + self._plan_duration = datetime.timedelta(seconds=end_time - start_time) + + def skip(self, pattern: str) -> None: + self._skips.append(re.compile(pattern)) + + @property + def steps(self) -> List[str]: + return [s.step_id for s in self._steps] diff --git a/scripts/tasks/release.py b/scripts/tasks/release.py new file mode 100644 index 00000000..fc95b861 --- /dev/null +++ b/scripts/tasks/release.py @@ -0,0 +1,252 @@ +from __future__ import annotations + +import os +import pathlib +import shutil +from typing import Dict, Optional + +from .constants import BUILD_ROOT +from .task import CompositeTask +from .venv import ( + CreateVenvTask, + RunCommandsTask, + RunCommandsWithVenvTask, + SyncLocalQAIHMVenvTask, +) + +qaihm_path = pathlib.Path(__file__).parent.parent.parent / "qai_hub_models" +version_path = qaihm_path / "_version.py" +version_locals: Dict[str, str] = {} +exec(open(version_path).read(), version_locals) +__version__ = version_locals["__version__"] + +DEFAULT_RELEASE_DIRECTORY = "./build/release" +RELEASE_DIRECTORY_VARNAME = "QAIHM_RELEASE_DIR" +REMOTE_REPOSITORY_URL_VARNAME = "QAIHM_REMOTE_URL" +PYPI_VARNAME = "QAIHM_PYPI_URL" + + +def _get_release_dir(): + """Get the path to the release directory.""" + return os.environ.get(RELEASE_DIRECTORY_VARNAME, DEFAULT_RELEASE_DIRECTORY) + + +def _get_release_repository_dir(): + """Get the path to the repository root in the release directory.""" + return os.path.join(_get_release_dir(), "repository") + + +def _get_wheel_dir(): + """Get the path to the wheels folder in the release directory.""" + return os.path.join(_get_release_dir(), "wheel") + + +class ReleaseTask(CompositeTask): + """ + Create a public version of the repository. 
+ """ + + def __init__( + self, + venv: Optional[str], + python_executable: Optional[str], + build_repository: bool = True, + push_repository: bool = True, + build_wheel: bool = True, + publish_wheel: bool = True, + ): + # Verify environment variables first + if push_repository and REMOTE_REPOSITORY_URL_VARNAME not in os.environ: + raise ValueError( + f"Specify a remote repository by setting env var '${REMOTE_REPOSITORY_URL_VARNAME}'" + ) + if publish_wheel and PYPI_VARNAME not in os.environ: + raise ValueError(f"Specify a pypi by setting env var '${PYPI_VARNAME}'") + + # Do the release + tasks = [] + if build_repository: + tasks.append(BuildPublicRepositoryTask(venv, python_executable)) + if build_wheel: + tasks.append(BuildWheelTask(venv, python_executable)) + if push_repository: + tasks.append(PushRepositoryTask()) + if publish_wheel: + tasks.append(PublishWheelTask(venv, python_executable)) + + super().__init__(f"Release QAIHM {__version__}", tasks) + + +class BuildPublicRepositoryTask(CompositeTask): + """ + Create a public version of the repository. 
+ """ + + def __init__(self, venv: Optional[str], python_executable: Optional[str]): + tasks = [] + + if not venv: + # Create Venv + venv = os.path.join(BUILD_ROOT, "test", "release_venv") + tasks.append(CreateVenvTask(venv, python_executable)) + tasks.append(SyncLocalQAIHMVenvTask(venv, ["dev"], include_aimet=False)) + + # Setup output directories + release_dir = _get_release_dir() + repo_output_dir = _get_release_repository_dir() + if os.path.exists(repo_output_dir): + shutil.rmtree(repo_output_dir) + + # Build Public Repository + tasks.append( + RunCommandsWithVenvTask( + "Run Release Script", + venv=venv, + env=os.environ, + commands=[ + f"python qai_hub_models/scripts/build_release.py --output-dir {repo_output_dir}" + ], + ) + ) + + super().__init__(f"Build Public Repository in: {release_dir}", tasks) + + +class PushRepositoryTask(CompositeTask): + """ + Publishes the repository in the provided release directory. + If no directory is provided, assumes the release directory defined above. 
+ """ + + def __init__(self): + tasks = [] + + # Remote URL + remote_url = os.environ.get(REMOTE_REPOSITORY_URL_VARNAME, None) + if not remote_url: + raise ValueError( + f"Specify a remote by setting envrionment variable '${REMOTE_REPOSITORY_URL_VARNAME}'" + ) + + env = os.environ.copy() + env["QAIHM_TAG"] = f"v{__version__}" + commands = [ + "git init", + "git add .", + f"git remote add origin {remote_url}", + "git checkout -b main", + ] + + # Git Credential Setup (Optional) + if "QAIHM_GIT_NAME" in env: + commands.append('git config --local user.name "${QAIHM_GIT_NAME}"') + if "QAIHM_REPO_GH_EMAIL" in env: + commands.append('git config --local user.email "${QAIHM_REPO_GH_EMAIL}"') + if "QAIHM_GIT_CRED_HELPER" in env: + commands.append( + 'git config --local credential.helper "!f() { sleep 1; ${QAIHM_GIT_CRED_HELPER}; }; f"' + ) + + commands += [ + "git commit -m $QAIHM_TAG", + # Pull from origin + "git pull origin main --rebase -X ours", + # Verify Tag does not exist + "if [ $(git tag -l '$QAIHM_TAG') ];" + "then echo 'Tag $QAIHM_TAG already exists. Aborting release.';" + "exit 1;" + "fi", + # End verify tag + "git push -u origin main", + "git tag $QAIHM_TAG", + "git push --tags", + ] + + # Push Release + tasks.append( + RunCommandsTask( + "Push Release", + env=env, + cwd=_get_release_repository_dir(), + commands=commands, + ) + ) + + super().__init__(f"Push Release to {remote_url}", tasks) + + +class BuildWheelTask(CompositeTask): + """ + Creates a wheel from the provided directory. + If no directory is provided, assumes the release directory defined above. 
+ """ + + def __init__(self, venv: Optional[str], python_executable: Optional[str]): + tasks = [] + + if not venv: + # Create Venv + venv = os.path.join(BUILD_ROOT, "test", "release_venv") + tasks.append(CreateVenvTask(venv, python_executable)) + tasks.append(SyncLocalQAIHMVenvTask(venv, ["dev"], include_aimet=False)) + + # Build Wheel + repo_dir = _get_release_repository_dir() + wheel_dir = _get_wheel_dir() + relative_wheel_dir = os.path.relpath(wheel_dir, repo_dir) + + if os.path.exists(wheel_dir): + shutil.rmtree(wheel_dir) + + tasks.append( + RunCommandsWithVenvTask( + "Build Wheel", + venv=venv, + env=os.environ, + commands=[ + f"cd {repo_dir} && " + f"python setup.py " + f"build --build-base {relative_wheel_dir} " + f"egg_info --egg-base {relative_wheel_dir} " + f"bdist_wheel --dist-dir {relative_wheel_dir}", + ], + ) + ) + + super().__init__(f"Build Wheel to: {wheel_dir}", tasks) + + +class PublishWheelTask(CompositeTask): + """ + Releases a wheel from the provided directory. + If no directory is provided, assumes the release directory defined above. 
+ """ + + def __init__(self, venv: Optional[str], python_executable: Optional[str]): + tasks = [] + + if not venv: + # Create Venv + venv = os.path.join(BUILD_ROOT, "test", "release_venv") + tasks.append(CreateVenvTask(venv, python_executable)) + tasks.append(SyncLocalQAIHMVenvTask(venv, ["dev"], include_aimet=False)) + + pypi = os.environ.get(PYPI_VARNAME, None) + if not pypi: + raise ValueError( + f"Set desired pypi via environment variable '${PYPI_VARNAME}'" + ) + + tasks.append( + RunCommandsWithVenvTask( + "Build Wheel", + venv=venv, + env=os.environ, + commands=[ + "pip install twine", + f"twine upload --repository-url {pypi} {os.path.join(_get_wheel_dir(), '*.whl')}", + ], + ) + ) + + super().__init__(f"Releasing Wheels in {pypi}", tasks) diff --git a/scripts/tasks/task.py b/scripts/tasks/task.py new file mode 100644 index 00000000..e8b8d2df --- /dev/null +++ b/scripts/tasks/task.py @@ -0,0 +1,257 @@ +import os +import subprocess +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +from .github import end_group, start_group +from .util import BASH_EXECUTABLE, default_parallelism, echo, have_root + +REPO_ROOT = Path(__file__).parent.parent.parent +TEST_RESULTS_DIR = os.path.join(REPO_ROOT, "build", "test-results") +COVERAGE_DIR = os.path.join(REPO_ROOT, "build", "test-coverage") + + +class Task(ABC): + def __init__(self, group_name: Optional[str]) -> None: + self.group_name = group_name + + @abstractmethod + def does_work(self) -> bool: + """ + Return True if this task actually does something (e.g., runs commands). + """ + + @abstractmethod + def run_task(self) -> None: + """ + Entry point for implementations: perform the task's action. + """ + + def run(self) -> None: + """ + Entry point for callers: perform any startup/teardown tasks and call run_task. 
+ """ + if self.group_name: + start_group(self.group_name) + self.run_task() + if self.group_name: + end_group() + + +class FailTask(Task): + """A Task that unconditionally fails.""" + + def __init__(self, message: str) -> None: + super().__init__(group_name=None) + self._message = message + + def does_work(self) -> bool: + return True + + def run_task(self) -> None: + raise RuntimeError(self._message) + + +class ListTasksTask(Task): + def __init__(self, tasks: List[str]) -> None: + super().__init__(group_name=None) + self.tasks = tasks + + def does_work(self) -> bool: + return False + + def run_task(self) -> None: + from . import plan + + for task_name in sorted(self.tasks): + print(task_name) + description = plan.TASK_DESCRIPTIONS.get(task_name, None) + if description: + print(f" {description}") + + +class NoOpTask(Task): + """A Task that does nothing.""" + + def __init__(self, group_name: Optional[str] = None) -> None: + super().__init__(group_name=group_name) + + def does_work(self) -> bool: + return False + + def run_task(self) -> None: + pass + + +class RunCommandsTask(Task): + """ + A Task that runs a list of commands using the shell. 
+ """ + + def __init__( + self, + group_name: Optional[str], + commands: Union[List[str], str], + as_root: bool = False, + env: Optional[Dict[str, str]] = None, + cwd: Optional[str] = None, + ) -> None: + super().__init__(group_name) + if isinstance(commands, str): + self.commands = [commands] + else: + self.commands = commands + + if as_root and not have_root(): + self.commands = [f"sudo {c}" for c in commands] + + self.cwd = cwd + self.env = env + + def does_work(self) -> bool: + return True + + def run_task(self) -> None: + for command in self.commands: + self._run_command(command) + + def _run_command(self, command: str) -> None: + echo(f"bnt $ {command}") + subprocess.run( + command, + shell=True, + check=True, + cwd=self.cwd, + env=self.env, + executable=BASH_EXECUTABLE, + ) + + +class RunCommandsWithVenvTask(RunCommandsTask): + """ + A Task that runs a list of commands using the shell with a specific Python + virtual environment enabled. + """ + + def __init__( + self, + group_name: Optional[str], + venv: Optional[str], + commands: Union[List[str], str], + env: Optional[Dict[str, str]] = None, + ) -> None: + super().__init__(group_name, commands, env=env) + self.venv = venv + + def run_task(self) -> None: + for command in self.commands: + if self.venv is not None: + venv_command = f"source {self.venv}/bin/activate && {command}" + echo(f"bnt $ {venv_command}") + subprocess.run( + venv_command, + shell=True, + check=True, + executable=BASH_EXECUTABLE, + env=self.env, + ) + else: + self._run_command(command) + + +class PyTestTask(RunCommandsWithVenvTask): + """A task to run pytest""" + + def __init__( + self, + group_name: Optional[str], + venv: Optional[str], + files_or_dirs: str, + report_name: str, + ignore: Optional[Union[str, List[str]]] = None, + omit: Optional[Union[str, List[str]]] = None, + parallel: Optional[Union[bool, int]] = None, + extra_args: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_coverage: bool = False, + ) -> None: 
+ pytest_options = f"--name={report_name}" + + if omit is not None: + pytest_options += f" --omit={omit}" + + if ignore: + if isinstance(ignore, str): + ignore = [ignore] + ignores = [f"--ignore={i}" for i in ignore] + pytest_options += f" {' '.join(ignores)}" + + if parallel: + if isinstance(parallel, bool): + parallel = default_parallelism() + pytest_options += f" -n {parallel}" + # Don't run tests that don't support parallelism + pytest_options += ' -m "not serial"' + + pytest_options += " -ra -vvv" + + if extra_args: + pytest_options += f" {extra_args}" + + if skip_coverage: + pytest_options += " --no-cov" + + pytest_options += f" {files_or_dirs}" + + command = f"{REPO_ROOT}/scripts/util/pytest_with_coverage.sh {pytest_options} " + + super().__init__(group_name, venv, command, env) + + +class CompositeTask(Task): + """ + A Task composed of a list of other Tasks. + """ + + def __init__(self, group_name: Optional[str], tasks: List[Task]) -> None: + super().__init__(group_name) + self.tasks = tasks + + def does_work(self) -> bool: + return any([t.does_work() for t in self.tasks]) + + def run_task(self) -> None: + for task in self.tasks: + task.run() + + +class ConditionalTask(Task): + """ + A Task that runs one of two alternatives, depending on the result of + a predicate function call. 
+ """ + + def __init__( + self, + group_name: Optional[str], + condition: Callable[[], bool], + true_task: Task, + false_task: Task, + ) -> None: + super().__init__(group_name) + self.condition = condition + self.true_task = true_task + self.false_task = false_task + + def does_work(self) -> bool: + if self.condition(): + return self.true_task.does_work() + else: + return self.false_task.does_work() + + def run_task(self) -> None: + if self.condition(): + self.true_task.run() + else: + self.false_task.run() diff --git a/scripts/tasks/test.py b/scripts/tasks/test.py new file mode 100644 index 00000000..cef5495d --- /dev/null +++ b/scripts/tasks/test.py @@ -0,0 +1,249 @@ +from __future__ import annotations + +import os +from tempfile import TemporaryDirectory +from typing import Iterable, Optional + +from .constants import ( + BUILD_ROOT, + PY_PACKAGE_MODELS_ROOT, + PY_PACKAGE_SRC_ROOT, + STORE_ROOT_ENV_VAR, +) +from .task import CompositeTask, PyTestTask, RunCommandsTask +from .util import can_support_aimet, model_needs_aimet +from .venv import ( + CreateVenvTask, + SyncLocalQAIHMVenvTask, + SyncModelRequirementsVenvTask, + SyncModelVenvTask, +) + + +class PyTestUtilsTask(PyTestTask): + """ + Pytest utils. + """ + + def __init__(self, venv: Optional[str]): + super().__init__( + "Test Utils", + venv=venv, + report_name="utils-tests", + files_or_dirs=f"{PY_PACKAGE_SRC_ROOT}/test/test_utils", + parallel=True, + ) + + +class PyTestScriptsTask(PyTestTask): + """ + Pytest scripts. + """ + + def __init__(self, venv: Optional[str]): + super().__init__( + group_name="Test Scripts", + venv=venv, + report_name="scripts-tests", + files_or_dirs=f"{PY_PACKAGE_SRC_ROOT}/scripts", + parallel=True, + ) + + +class PyTestE2eHubTask(CompositeTask): + """ + Runs e2e tests on Hub that's not specific to any model. + """ + + def __init__(self, venv: Optional[str]): + # Create temporary directory for storing cloned & downloaded test artifacts. 
class PyTestModelTask(CompositeTask):
    """
    Run all tests for a single model.

    Builds, in order: environment setup (fresh venv or requirements sync),
    the model's standard test suite (``test.py``), optionally its export
    suite (``test_generated.py``), and removal of the venv if one was created.
    Models that need AIMET on a platform without AIMET support are skipped
    with an echo message instead.
    """

    def __init__(
        self,
        model_name: str,
        python_executable: str,
        test_export: bool,
        venv: str
        | None = None,  # If None, creates a fresh venv for each model instead of using 1 venv for all models.
        use_shared_cache=False,  # If True, STORE_ROOT_ENV_VAR is inherited unchanged instead of pointing at a per-model temporary directory.
        export_func: str = "compile",
        skip_standard_unit_test: bool = False,
    ):
        tasks = []
        # Per-model temporary store root; stays None when it is not needed.
        self._store_root = None

        if model_needs_aimet(model_name) and not can_support_aimet():
            tasks.append(
                RunCommandsTask(
                    f"Skip Model {model_name}",
                    f'echo "Skipping Tests For Model {model_name} -- AIMET is required, but AIMET is not supported on this platform."',
                )
            )
        else:
            # Create test environment
            if not venv:
                model_venv = os.path.join(BUILD_ROOT, "test", "model_envs", model_name)
                tasks.append(CreateVenvTask(model_venv, python_executable))
                # Creates a new environment from scratch
                tasks.append(
                    SyncModelVenvTask(model_name, model_venv, include_dev_deps=True)
                )
            else:
                model_venv = venv
                # Only install requirements.txt into existing venv
                tasks.append(
                    SyncModelRequirementsVenvTask(
                        model_name, model_venv, pip_force_install=False
                    )
                )

            # Temporary directory for storing cloned & downloaded test artifacts.
            # Held on the instance (not in a `with` block) because the pytest
            # tasks run after __init__ returns; a `with` block would already
            # have deleted the directory they are told to use.
            env = os.environ.copy()
            if not use_shared_cache:
                self._store_root = TemporaryDirectory()
                env[STORE_ROOT_ENV_VAR] = self._store_root.name

            # Standard Test Suite
            model_dir = os.path.join(PY_PACKAGE_MODELS_ROOT, model_name)
            model_test_without_export = os.path.join(model_dir, "test.py")
            if (
                os.path.exists(model_test_without_export)
                and not skip_standard_unit_test
            ):
                tasks.append(
                    PyTestTask(
                        group_name=f"Model: {model_name}",
                        venv=model_venv,
                        report_name=f"model-{model_name}-tests",
                        files_or_dirs=model_test_without_export,
                        parallel=False,
                        extra_args="-s",
                        env=env,
                    )
                )

            # Export Test Suite
            if test_export and os.path.isfile(
                os.path.join(model_dir, "test_generated.py")
            ):
                tasks.append(
                    PyTestTask(
                        group_name=f"Model Export: ({model_name})",
                        venv=model_venv,
                        report_name=f"model-export-{model_name}-tests",
                        files_or_dirs=model_dir,
                        parallel=False,
                        extra_args=f"-s -m {export_func}",
                        env=env,
                    )
                )

            if not venv:
                tasks.append(
                    RunCommandsTask(
                        f"Remove virtual environment at {model_venv}",
                        f"rm -rf {model_venv}",
                    )
                )

        super().__init__(f"Model Tests: {model_name}", tasks)

    def run_task(self) -> None:
        """Run the model's tasks, then remove the temporary store root if any."""
        try:
            super().run_task()
        finally:
            if self._store_root is not None:
                self._store_root.cleanup()
class PyTestModelsTask(CompositeTask):
    """
    Run tests for the provided set of models.

    Environment variables read at construction time:
      * TEST_HUB_ASYNC: enables asynchronous export tests unless it is unset,
        empty, "0" or "false".
      * COMPILE_JOBS_FILE: must be set when asynchronous compile tests are
        enabled; the file is truncated before the tests start.
    """

    def __init__(
        self,
        python_executable: str,
        models_for_testing: Iterable[str],
        models_to_test_export: Iterable[str],
        base_test_venv: str | None = None,  # Env with QAIHM installed
        venv_for_each_model: bool = True,  # Create a fresh venv for each model instead of using the base test venv instead.
        use_shared_cache: bool = False,  # Forwarded to PyTestModelTask; if True, tests do not get a temporary store root.
        export_func: str = "compile",
        skip_standard_unit_test: bool = False,
    ):
        tasks = []

        # Whether or not export tests will be run asynchronously
        # (submit all jobs for all models at once, rather than one model at a time).
        # Environment values are strings, so "0" must be rejected explicitly:
        # a bare truthiness test would treat it as enabled.
        test_hub_async = os.environ.get("TEST_HUB_ASYNC", "0").strip().lower() not in (
            "",
            "0",
            "false",
        )
        async_compile = test_hub_async and export_func == "compile"

        if async_compile:
            # Clean previous (cached) compile test jobs.
            tasks.append(
                RunCommandsTask(
                    "Delete stored compile jobs from past test runs.",
                    f"> {os.environ['COMPILE_JOBS_FILE']}",
                )
            )

        # Only a venv created here is removed at the end. When no base venv is
        # supplied and none is needed, there is nothing to delete.
        created_base_venv = False
        if base_test_venv is None and (not venv_for_each_model or test_hub_async):
            # Create Venv
            base_test_venv = os.path.join(BUILD_ROOT, "test", "base_venv")
            created_base_venv = True
            tasks.append(CreateVenvTask(base_test_venv, python_executable))
            tasks.append(
                SyncLocalQAIHMVenvTask(base_test_venv, ["dev"], include_aimet=False)
            )

        print(f"Tests to be run for directories: {models_for_testing}")
        for model_name in models_for_testing:
            # Run standard test suite for this model.
            tasks.append(
                PyTestModelTask(
                    model_name,
                    python_executable,
                    model_name in models_to_test_export,
                    venv=None if venv_for_each_model else base_test_venv,
                    use_shared_cache=use_shared_cache,
                    export_func=export_func,
                    skip_standard_unit_test=skip_standard_unit_test,
                )
            )

        if async_compile:
            # Wait for compile test jobs to finish; verify success
            tasks.append(
                PyTestTask(
                    group_name="Verify Compile Jobs Success",
                    venv=base_test_venv,
                    report_name="compile-jobs-success",
                    files_or_dirs=os.path.join(
                        PY_PACKAGE_SRC_ROOT, "test", "test_async_compile_jobs.py"
                    ),
                    parallel=False,
                    extra_args="-s",
                )
            )

        if created_base_venv:
            # Cleanup venv
            tasks.append(
                RunCommandsTask(base_test_venv, f"rm -rf {base_test_venv}")
            )

        super().__init__("All Per-Model Tests", tasks)
import contextlib
import os
import platform
import subprocess
import sys


class Colors:
    """ANSI escape sequences for colored terminal output."""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    YELLOW = "\033[0;33m"
    OFF = "\033[0m"


@contextlib.contextmanager
def new_cd(x):
    """Context manager that changes into directory ``x`` and restores the
    previous working directory on exit, even if the body raises."""
    d = os.getcwd()

    # This could raise an exception, but it's probably
    # best to let it propagate and let the caller
    # deal with it, since they requested x
    os.chdir(x)

    try:
        yield

    finally:
        # This could also raise an exception, but you *really*
        # aren't equipped to figure out what went wrong if the
        # old working directory can't be restored.
        os.chdir(d)


def can_support_aimet(platform: str = sys.platform) -> bool:
    """Return True when AIMET can be used: a Linux platform on Python 3.8.

    Args:
        platform: Platform identifier in ``sys.platform`` form; defaults to
            the running interpreter's value.
    """
    # Compare (major, minor) together so only Python 3.8 qualifies; checking
    # the minor version alone would also match any other "x.8" release.
    return platform in ("linux", "linux2") and sys.version_info[:2] == (3, 8)


def model_needs_aimet(model_name: str) -> bool:
    """Return True when the model name contains "quantized" (case-insensitive)."""
    return "quantized" in model_name.lower()


def default_parallelism() -> int:
    """A conservative number of processes across which to spread pytests desiring parallelism."""
    from .github import on_github  # avoid circular import

    cpu_count = os.cpu_count()
    if not cpu_count:
        # os.cpu_count() may return None when the count is undetermined.
        return 1

    # In CI, saturate the machine
    if on_github():
        return cpu_count

    # When running locally, leave a little CPU for other uses
    return max(1, cpu_count - 2)
# Convenience function for printing to stdout without buffering.
def echo(value, **args):
    """Print ``value`` and flush immediately; extra kwargs go to ``print``."""
    print(value, flush=True, **args)


def have_root() -> bool:
    """Return True when the effective user id is 0."""
    return os.geteuid() == 0


def on_linux():
    """Return True when ``platform.uname()`` reports a Linux system."""
    return platform.uname().system == "Linux"


def on_mac():
    """Return True when ``platform.uname()`` reports a Darwin (macOS) system."""
    return platform.uname().system == "Darwin"


def process_output(command):
    """Return the captured stdout of a completed process as stripped UTF-8 text."""
    return command.stdout.decode("utf-8").strip()


def run(command, env=None):
    """Run ``command`` through bash and raise CalledProcessError on failure.

    Args:
        command: Shell command line.
        env: Optional environment mapping for the child process; None
            inherits the current environment.
    """
    return subprocess.run(
        command, shell=True, check=True, executable=BASH_EXECUTABLE, env=env
    )


def run_and_get_output(command, check=True):
    """Run ``command`` through bash and return its stripped stdout.

    Args:
        command: Shell command line.
        check: When True, a non-zero exit status raises CalledProcessError.
    """
    return process_output(
        subprocess.run(
            command,
            stdout=subprocess.PIPE,
            shell=True,
            check=check,
            executable=BASH_EXECUTABLE,
        )
    )


def run_with_venv(venv, command, env=None):
    """Run ``command`` through bash, activating ``venv`` first when one is given.

    Args:
        venv: Path to a virtual environment, or None to run without one.
        command: Shell command line.
        env: Optional environment mapping for the child process.
    """
    if venv is not None:
        subprocess.run(
            f"source {venv}/bin/activate && {command}",
            shell=True,
            check=True,
            executable=BASH_EXECUTABLE,
            env=env,
        )
    else:
        # Forward env here too, so the caller's environment is honored with
        # or without a venv.
        run(command, env=env)


def run_with_venv_and_get_output(venv, command):
    """Like ``run_with_venv`` but return the command's stripped stdout."""
    if venv is not None:
        return process_output(
            subprocess.run(
                f"source {venv}/bin/activate && {command}",
                stdout=subprocess.PIPE,
                shell=True,
                check=True,
                executable=BASH_EXECUTABLE,
            )
        )
    else:
        return run_and_get_output(command)


# Resolved once at import time; the helpers above look this name up when they
# are called, so defining it after them is fine.
BASH_EXECUTABLE = process_output(
    subprocess.run("which bash", stdout=subprocess.PIPE, shell=True, check=True)
)
from __future__ import annotations

import os
import subprocess
from typing import Iterable

from .constants import (
    PY_PACKAGE_INSTALL_ROOT,
    PY_PACKAGE_MODELS_ROOT,
    PY_PACKAGE_SRC_ROOT,
    QAI_HUB_LATEST_PATH,
    REPO_ROOT,
)
from .task import CompositeTask, RunCommandsTask, RunCommandsWithVenvTask
from .util import can_support_aimet, model_needs_aimet


class CreateVenvTask(RunCommandsTask):
    """Create a virtual environment by sourcing ``scripts/util/env_create.sh``
    with ``--no-sync``."""

    def __init__(self, venv_path: str, python_executable: str) -> None:
        super().__init__(
            f"Creating virtual environment at {venv_path}",
            f"source {REPO_ROOT}/scripts/util/env_create.sh --python={python_executable} --venv={venv_path} --no-sync",
        )


def is_package_installed(package_name: str, venv_path: str | None = None) -> bool:
    """Return True when ``import <package_name>`` succeeds.

    The import is attempted in a subprocess, inside ``venv_path`` when given
    and with the ``python`` found on PATH otherwise.
    """
    if venv_path is not None:
        command = f'. {venv_path}/bin/activate && python -c "import {package_name}"'
    else:
        command = f'python -c "import {package_name}"'

    try:
        subprocess.check_call(command, shell=True)
        return True
    except subprocess.CalledProcessError:
        return False


class SyncLocalQAIHMVenvTask(CompositeTask):
    """Sync the provided environment with local QAIHM and the provided extras."""

    def __init__(
        self,
        venv_path: str | None,
        extras: Iterable[str] = (),
        include_aimet: bool = can_support_aimet(),
    ) -> None:
        tasks = []
        # Materialize once: a one-shot iterator would always be truthy and
        # could not be safely tested and then joined.
        extras = list(extras)

        # Install AIMET first to avoid installing two versions of torch (one from AIMET, one from QAIHM).
        if include_aimet:
            if can_support_aimet():
                if is_package_installed("aimet_torch", venv_path):
                    tasks.append(
                        RunCommandsTask(
                            group_name="AIMET Installation Warning",
                            commands=[
                                'echo "WARNING: Skipping AIMET Install because it is already installed."'
                            ],
                        )
                    )
                else:
                    tasks.append(
                        RunCommandsWithVenvTask(
                            group_name="Install AIMET",
                            venv=venv_path,
                            commands=[
                                f'"{PY_PACKAGE_SRC_ROOT}/scripts/install_aimet_cpu.sh"'
                            ],
                        )
                    )

            else:
                tasks.append(
                    RunCommandsTask(
                        group_name="AIMET Installation Warning",
                        commands=[
                            'echo "WARNING: Skipping AIMET Install because it is not supported on this platform."'
                        ],
                    )
                )

        qai_hub_wheel_url = os.environ.get("QAI_HUB_WHEEL_URL", None)
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the wheel install is assumed to sit inside this check.
        if not is_package_installed("qai_hub", venv_path):
            if qai_hub_wheel_url is None:
                if os.path.exists(QAI_HUB_LATEST_PATH):
                    qai_hub_wheel_url = QAI_HUB_LATEST_PATH

            if qai_hub_wheel_url:
                # Install local QAI Hub wheel if it exists, instead of pulling it from PyPi.
                tasks.append(
                    RunCommandsWithVenvTask(
                        group_name="Install QAI Hub (Pre-Release)",
                        venv=venv_path,
                        commands=[f'pip install "{qai_hub_wheel_url}"'],
                    )
                )

        extras_str = f"[{','.join(extras)}]" if extras else ""
        tasks.append(
            RunCommandsWithVenvTask(
                group_name=f"Install QAIHM{extras_str}",
                venv=venv_path,
                commands=[
                    f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html'
                ],
            )
        )

        super().__init__(
            f"Create Local QAIHM{extras_str} Virtual Environment at {venv_path}",
            tasks,
        )
class SyncModelVenvTask(SyncLocalQAIHMVenvTask):
    """Sync the provided environment with local QAIHM and the provided extras needed for the model_name."""

    def __init__(
        self,
        model_name,
        venv_path,
        include_dev_deps: bool = False,
        only_model_requirements: bool = False,
    ) -> None:
        # NOTE(review): only_model_requirements is accepted but not used here.
        extras = []
        if include_dev_deps:
            extras.append("dev")
        # A model only has its own extra when it ships a requirements.txt.
        if os.path.exists(
            os.path.join(PY_PACKAGE_MODELS_ROOT, model_name, "requirements.txt")
        ):
            extras.append(model_name)

        super().__init__(
            venv_path,
            extras,
            model_needs_aimet(model_name),
        )


class SyncModelRequirementsVenvTask(RunCommandsWithVenvTask):
    """Sync the provided environment with requirements from model_name's requirements.txt.
    Will not re-install QAI Hub Models. Intended for speeding up CI compared to building an entirely new env for each model."""

    def __init__(self, model_name, venv_path, pip_force_install: bool = True) -> None:
        requirements_txt = os.path.join(
            PY_PACKAGE_MODELS_ROOT, model_name, "requirements.txt"
        )
        if os.path.exists(requirements_txt):
            # Emit the flag or nothing at all. Interpolating None would put
            # the literal word "None" on the command line, which pip would
            # treat as a package to install.
            force_flag = "--force-reinstall " if pip_force_install else ""
            commands = [
                f'pip install {force_flag}-r "{requirements_txt}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html'
            ]
        else:
            # No requirements file: the task has no commands to run.
            commands = []

        super().__init__(
            group_name=f"Install Model Requirements for {model_name}",
            venv=venv_path,
            commands=commands,
        )
# Common utilities

# shellcheck disable=SC2034  # various definitions appear unused in this included source.

REPO_ROOT=$(git rev-parse --show-toplevel)

COLOR_GREEN='\033[0;32m'
COLOR_GREY='\033[0;37m'
COLOR_RED='\033[0;31m'
COLOR_RED_BOLD='\033[0;1;31m'
COLOR_RED_REVERSED_VIDEO='\033[0;7;31m'
COLOR_YELLOW='\033[0;33m'
COLOR_YELLOW_BOLD='\033[0;1;33m'
COLOR_OFF='\033[0m'

FORMAT_BOLD='\033[0;1m'
FORMAT_UNDERLINED='\033[0;4m'
FORMAT_BLINKING='\033[0;5m'
FORMAT_REVERSE_VIDEO='\033[0;7m'

#
# Emit an error message (red, reverse video) to stderr.
#
function log_err() {
    echo -e "${COLOR_RED_REVERSED_VIDEO}$*${COLOR_OFF}" 1>&2
}

#
# Emit a warning message (bold yellow) to stderr.
#
function log_warn() {
    echo -e "${COLOR_YELLOW_BOLD}$*${COLOR_OFF}" 1>&2
}

#
# Emit an informational message (bold) to stderr.
#
function log_info() {
    echo -e "${FORMAT_BOLD}$*${COLOR_OFF}" 1>&2
}

#
# Emit a debug message (grey) to stderr.
#
function log_debug() {
    echo -e "${COLOR_GREY}$*${COLOR_OFF}" 1>&2
}

#
# Emit a log message and exit with non-zero return.
#
function die() {
    log_err "$*"
    exit 1
}

#
# Run something as root, using sudo if necessary.
#
function run_as_root()
{
    # Don't use sudo if user is root already (e.g., in docker)
    if [ "${EUID}" -eq 0 ]; then
        log_debug "We're already root; running ${*} without sudo."
        "${@}"
    else
        log_debug "We're ${EUID}; running ${*} via sudo."
        sudo "${@}"
    fi
}

#
# Enable trace logging via set -x
# Args
#   1: [Default: $QAIHM_BUILD_XTRACE] If set to non-empty, enable tracing
#
# shellcheck disable=SC2120
function set_xtrace() {
    local enable="${1:-${QAIHM_BUILD_XTRACE:-}}"
    if [ -n "${enable}" ]; then
        set -x
    fi
}

#
# Enable bash strict mode and conditionally set -x.
# @see http://redsymbol.net/articles/unofficial-bash-strict-mode/
# @see set_xtrace
#
function set_strict_mode() {
    set -euo pipefail
    set_xtrace
}

#
# Print the arguments as a bracketed, comma-separated list, e.g. [a,b,c].
#
function pretty_print_arr() {
    # "$*" joins the arguments with the first character of IFS. Setting IFS
    # locally keeps the caller's IFS intact, leaves spaces inside an element
    # untouched, and avoids leaking a global array into the sourcing shell.
    local IFS=','
    echo "[$*]"
}
REPO_ROOT=$(git rev-parse --show-toplevel)

. "${REPO_ROOT}/scripts/util/common.sh"

# Default to empty so the checks below are safe under `set -u`.
GITHUB_ACTION=${GITHUB_ACTION:-}

#
# Print "1" when GITHUB_ACTION is non-empty; print nothing otherwise.
#
on_ci() {
    if [ -n "${GITHUB_ACTION}" ]; then
        echo "1"
    fi
}

#
# Start a log group: a ::group:: workflow command when GITHUB_ACTION is set,
# a green heading otherwise.
# Args
#   1: group name
#
start_group() {
    # local keeps the name out of the shell that sourced this file.
    local group_name=$1

    if [ -n "$GITHUB_ACTION" ]; then
        echo "::group::$group_name"
    else
        echo -e "${COLOR_GREEN}$group_name${COLOR_OFF}"
    fi
}

#
# Close the current log group; prints nothing when GITHUB_ACTION is unset.
#
end_group() {
    if [ -n "$GITHUB_ACTION" ]; then
        echo "::endgroup::"
    fi
}

#
# Append "name=value" to the file named by GITHUB_OUTPUT when GITHUB_ACTION is set.
# Args
#   1: output name
#   2: output value
#
set_github_output() {
    if [ -n "$GITHUB_ACTION" ]; then
        echo "$1=$2" >> "$GITHUB_OUTPUT"
    fi
}

#
# Emit a warning: a ::warning:: workflow command when GITHUB_ACTION is set,
# red text otherwise.
# Args
#   1: message
#
warn() {
    local message=$1

    if [ -n "$GITHUB_ACTION" ]; then
        echo "::warning::$message"
    else
        echo -e "${COLOR_RED}$message${COLOR_OFF}"
    fi
}
import argparse
import configparser


def main(argv=None):
    """Write a new coveragerc derived from a base file.

    Reads the ``--base`` coveragerc (if given), appends the comma-separated
    ``--omit`` patterns to the ``[run] omit`` option, optionally sets
    ``[run] data_file``, and writes the result to ``--output``.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base", type=str, help="Use this coveragerc as a base and add to it."
    )
    parser.add_argument("--omit", type=str, help="Comma-separate omit directories")
    parser.add_argument("--data_file", type=str, help="Output coverage data file")
    parser.add_argument(
        "-o", "--output", type=str, help="Save new coveragerc to this folder."
    )

    args = parser.parse_args(argv)
    if args.output is None:
        parser.error("--output is required")

    config = configparser.ConfigParser()
    if args.base is not None:
        config.read(args.base)
    if not config.has_section("run"):
        config.add_section("run")

    # --omit may be absent or an empty string; either way it adds no
    # patterns, and empty entries are dropped so the option never gains a
    # dangling comma.
    extra_omit = [entry for entry in (args.omit or "").split(",") if entry]
    cur_omit = [
        entry
        for entry in config.get("run", "omit", fallback="").split(",")
        if entry
    ]

    if args.data_file is not None:
        config.set("run", "data_file", args.data_file)
    if cur_omit or extra_omit:
        config.set("run", "omit", ",".join(cur_omit + extra_omit))

    with open(args.output, "w") as f:
        config.write(f)


if __name__ == "__main__":
    main()
#!/usr/bin/env bash

REPO_ROOT=$(git rev-parse --show-toplevel)

# Load helpers
. "${REPO_ROOT}/scripts/util/common.sh"
. "${REPO_ROOT}/scripts/util/github.sh"

set_strict_mode


print_help() {
    echo "pytest_with_coverage.sh --name=[...] --omit=[...] PYTEST_ARGS"
    echo ""
    echo "--name=[...] Test report name."
    echo "--omit=[...] Comma-seprated directories."
}

NAME="unnamed"
OMIT=""
# Everything that is not one of this script's own options is forwarded to pytest.
PYTEST_ARGS=()

# Collect pass-through arguments explicitly. Calling `shift` while iterating
# over "$@" always drops the first positional parameter, not the one being
# examined, so it is only correct when --name/--omit happen to come first.
for arg in "$@"; do
    case $arg in
        --name=*)
            NAME="${arg#*=}"
            ;;
        --omit=*)
            OMIT="${arg#*=}"
            ;;
        -h|--help)
            print_help
            exit 0
            ;;
        *)
            PYTEST_ARGS+=("$arg")
            ;;
    esac
done


# mktemp creates a file of its own; remember it so both it and the derived
# .coveragerc are removed on exit instead of accumulating in the temp dir.
COV_CONFIG_TMP="$(mktemp)"
COV_CONFIG="${COV_CONFIG_TMP}.coveragerc"
trap 'rm -f "${COV_CONFIG_TMP}" "${COV_CONFIG}"' EXIT

COVERAGE_DIR="${REPO_ROOT}/build/test-coverage"
RESULTS_DIR="${REPO_ROOT}/build/test-results"

mkdir -p "$COVERAGE_DIR" "$RESULTS_DIR"

DATA_FILE="${COVERAGE_DIR}/.coverage.${NAME}"
JUNIT_REPORT="${RESULTS_DIR}/${NAME}.xml"

python "${REPO_ROOT}/scripts/util/make_coverage_config.py" \
    --base "${REPO_ROOT}/.coveragerc" \
    --data_file "${DATA_FILE}" \
    --omit "${OMIT}" \
    --output "${COV_CONFIG}"

# Coverage can be turned off by passing `--no-cov` as part of the pytest args.
# The ${arr[@]+...} form keeps an empty array safe under `set -u` on older bash.
pytest \
    -rxXs \
    -p no:warnings \
    --junitxml="${JUNIT_REPORT}" \
    --durations=20 \
    --durations-min=0.5 \
    --cov \
    --cov-report= \
    --cov-config="${COV_CONFIG}" \
    ${PYTEST_ARGS[@]+"${PYTEST_ARGS[@]}"}
import argparse
import os


def _collect_changed_files(raw):
    """Return the sorted, de-duplicated file names found in ``raw``.

    ``raw`` is the text form of a two-dimensional list (one list of changed
    files per commit). It is split on commas, empty entries are dropped and
    square brackets are removed. Files ending in .yaml or .md are excluded.
    """
    names = {
        entry.replace("[", "").replace("]", "")
        for entry in raw.split(",")
        if entry != ""
    }
    # Sorted so the written file is reproducible; plain set iteration order
    # can differ between runs.
    return [
        name
        for name in sorted(names)
        # Avoid running for yaml and md files.
        if os.path.splitext(name)[1] not in {".yaml", ".md"}
    ]


def main(argv=None):
    """Parse the command line and write the changed files, one per line.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--files",
        type=str,
        help="Files that were changed in the commits of the PR.",
        required=True,
    )
    parser.add_argument(
        "--path", type=str, help="Path for the file to be created.", required=True
    )

    args = parser.parse_args(argv)

    # We get back a two-dimensional array, with a list of
    # changed files for each commit that has been traced back.
    # For this usecase, we need changed files in the commit so
    # flattening and deduplicating it.
    filenames = "\n".join(_collect_changed_files(args.files))

    # Make the directory if not present. A bare file name has an empty
    # dirname, which os.makedirs rejects, so only create when there is one.
    directory = os.path.dirname(args.path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(args.path, mode="wt", encoding="utf-8") as file:
        file.write(filenames)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
#
# Copyright (c) 2024, Qualcomm® Technologies, Inc. All rights reserved.
#

import pathlib
from typing import Dict

from setuptools import find_packages, setup

r_file = "requirements.txt"

qaihm_path = pathlib.Path(__file__).parent / "qai_hub_models"
qaihm_dir = qaihm_path / "models"
requirements_path = qaihm_path / r_file


def _read_requirements(path) -> list:
    """Return the stripped, non-blank lines of a requirements file."""
    # The context manager closes the file promptly instead of leaving the
    # handle to the garbage collector.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


# Load __version__ by executing _version.py in an isolated namespace.
version_path = qaihm_path / "_version.py"
version_locals: Dict[str, str] = {}
exec(version_path.read_text(), version_locals)

# Extras dictionary definition.
extras_require = {"dev": _read_requirements(qaihm_path / "requirements-dev.txt")}

# Create extra for every model that requires one.
for model_dir in qaihm_dir.iterdir():
    if not model_dir.is_file() and (model_dir / r_file).exists():
        # Register the extra under both the directory name and its dashed
        # form, so either spelling can be requested.
        extra_with_dash = model_dir.name.replace("_", "-")
        reqs = _read_requirements(model_dir / r_file)
        extras_require[model_dir.name] = reqs
        extras_require[extra_with_dash] = reqs


description = "Models optimized for export to run on device."
# Read as UTF-8 explicitly so the result does not depend on the build
# machine's locale.
long_description = (pathlib.Path(__file__).parent / "README.md").read_text(
    encoding="utf-8"
)
setup(
    name="qai_hub_models",
    version=version_locals["__version__"],
    description=description,
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Qualcomm® Technologies, Inc.",
    url="https://github.com/quic/ai-hub-models",
    packages=find_packages(),
    python_requires=">=3.8, <3.11",
    package_data={
        "qai_hub_models": ["**/*.yaml", "**/*.txt", "**/*.json", "**/*.diff"]
    },
    include_package_data=True,
    install_requires=_read_requirements(requirements_path),
    extras_require=extras_require,
    license="MIT",
)