Export preprocess
lucylq committed Jul 29, 2024
1 parent 898670f commit 9ae9c34
Showing 5 changed files with 212 additions and 107 deletions.
8 changes: 6 additions & 2 deletions tests/torchtune/models/clip/test_clip_image_transform.py
@@ -10,7 +10,7 @@

import torch

from torchtune.models.clip._transforms import CLIPImageTransform
from torchtune.models.clip._transforms import CLIPImageTransform, ImageTransformConfig


class TestPipelines:
@@ -41,16 +41,20 @@ class TestPipelines:
)
def test_clip_image_transform(self, params):

image_transform = CLIPImageTransform(
config = ImageTransformConfig(
image_mean=None,
image_std=None,
tile_size=224,
possible_resolutions=None,
max_num_tiles=4,
resample="bilinear",
resize_to_max_canvas=params["resize_to_max_canvas"],
max_upscaling_size=None,
normalize=False,
)

image_transform = CLIPImageTransform(config=config)

image_size = params["image_size"]

# Create a random image
6 changes: 3 additions & 3 deletions tests/torchtune/modules/transforms/test_tile_crop.py
@@ -8,7 +8,7 @@

import torch

from torchtune.modules.transforms import tile_crop
from torchtune.modules.transforms import TileCrop


class TestTransforms:
@@ -55,7 +55,7 @@ def test_tile_crop(self, params):
image = torch.rand(*image_size) # Create a random image tensor

if status == "Passed":
tiles = tile_crop(image, tile_size)
tiles = TileCrop()(image, tile_size)
expected_output_shape = params["expected_output_shape"]
assert (
tiles.shape == expected_output_shape
@@ -73,7 +73,7 @@

elif status == "Failed":
with pytest.raises(Exception) as exc_info:
tile_crop(image, tile_size)
TileCrop()(image, tile_size)
expected_error = params["error"]
actual_error = str(exc_info.value)
assert (
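
The tests above now exercise tile cropping through the TileCrop module instead of the tile_crop function. As a rough sketch of the operation (an illustrative assumption, not torchtune's implementation), tile cropping splits a [C, H, W] image whose sides are multiples of tile_size into a batch of tiles:

import torch

def tile_crop_sketch(image: torch.Tensor, tile_size: int) -> torch.Tensor:
    # Split H and W into (grid, tile_size) pairs, then flatten the grid into one tile axis.
    c, h, w = image.shape
    tiles_h, tiles_w = h // tile_size, w // tile_size
    tiles = image.reshape(c, tiles_h, tile_size, tiles_w, tile_size)
    tiles = tiles.permute(1, 3, 0, 2, 4)  # [tiles_h, tiles_w, C, tile_size, tile_size]
    return tiles.reshape(tiles_h * tiles_w, c, tile_size, tile_size)

# e.g. tile_crop_sketch(torch.rand(3, 448, 896), 224).shape == (8, 3, 224, 224)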
245 changes: 171 additions & 74 deletions torchtune/models/clip/_transforms.py
@@ -5,24 +5,129 @@
# LICENSE file in the root directory of this source tree.

import logging
import math
from dataclasses import dataclass
from typing import Any, List, Mapping, Optional, Tuple

import torch
import torchvision
from PIL import Image

from torchtune.modules.transforms import (
find_supported_resolutions,
get_canvas_best_fit,
resize_with_pad,
tile_crop,
TileCrop,
)
from torchvision.transforms import v2
from torchvision.transforms._functional_tensor import resize

from torchvision.transforms.v2 import functional as F

logger = logging.getLogger(__name__)


@dataclass
class ImageTransformConfig:
"""
Configuration for ``CLIPImageTransform`` and ``CLIPImageTransformCore``.

image_mean (Optional[List[float]]): Mean values of each channel, used for normalization.
Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
image_std (Optional[List[float]]): Standard deviation values of each channel, used for normalization.
Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
possible_resolutions (Optional[List[Tuple[int, int]]]): List of possible resolutions as tuples (height, width),
where each tuple represents a possible canvas to fit the image into when calling ``get_canvas_best_fit``.
If None, this will be calculated using max_num_tiles and tile_size. Default None.
tile_size (int): Size of the tiles to divide the image into. Default 224.
max_num_tiles (Optional[int]): Only used if possible_resolutions is NOT given.
Maximum number of tiles to break an image into.
This will be used to generate possible_resolutions,
e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224.
Default 4.
resample (str): Resampling method used when resizing images. Supports any enum of
``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic".
Default 'bilinear'.
resize_to_max_canvas (bool): If True, the image will be upscaled without distortion to fit the largest possible
resolution from possible_resolutions.
If False, it will pick the resolution that minimizes downscaling, including no downscaling at all.
In this case, the image will only be upscaled if its size < tile_size. Default False.
max_upscaling_size (Optional[int]): Limit on any upscaling performed on the image. If None, upscaling is not
capped. Default None.
normalize (bool): If True, the image will be normalized using image_mean and image_std. Default True.
"""

image_mean: Optional[List[float]] = None
image_std: Optional[List[float]] = None
possible_resolutions: Optional[List[Tuple[int, int]]] = None
tile_size: int = 224
max_num_tiles: Optional[int] = 4
resample: str = "bilinear"
resize_to_max_canvas: bool = False
max_upscaling_size: Optional[int] = None
normalize: bool = True
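
# Illustrative sketch (an assumption, not torchtune's find_supported_resolutions):
# when possible_resolutions is omitted, candidate canvases can be derived from every
# (rows, cols) tile grid whose tile count stays within max_num_tiles, e.g.
# max_num_tiles=2, tile_size=224 -> [(224, 224), (224, 448), (448, 224)].
def _candidate_resolutions_sketch(
    max_num_tiles: int, tile_size: int
) -> List[Tuple[int, int]]:
    resolutions = []
    for n in range(1, max_num_tiles + 1):
        for rows in range(1, n + 1):
            if n % rows == 0:
                resolutions.append((rows * tile_size, (n // rows) * tile_size))
    return resolutions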


class CLIPImageTransformCore(torch.nn.Module):
def __init__(self, config: ImageTransformConfig):
super().__init__()
self.config = config

"""
Performs the core transformations involved in CLIPImageTransform;
1. Resize the image to target_size.
2. Pad the image to canvas_size.
3. Normalize the image using image_mean and image_std.
4. Split the image into n * channels * tile_size * tile_size.
Args:
image (torch.Tensor): image as a 3D tensor in form [C, H, W].
target_size (torch.Tensor): tensor of shape [1, 2] containing the target_height and target_width for resize.
canvas_size (torch.Tensor): tensor of shape [1, 2] containing the canvas_height and canvas_width for padding.
Returns:
Tuple[torch.Tensor, torch.Tensor]: Image tensor of shape [n, channels, config.tile_size, config.tile_size]
and aspect ratio tensor of shape [2, 2].
"""

def forward(
self, image: torch.Tensor, target_size: torch.Tensor, canvas_size: torch.Tensor
):
# Resize.
ts0, ts1 = target_size.tolist()
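# torch._check both validates these sizes at runtime and, when the module is
# exported, gives the exporter bounds on the data-dependent resize target.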
torch._check(ts0 >= 2)
torch._check(ts0 <= 4000)
torch._check(ts1 >= 2)
torch._check(ts1 <= 4000)

image = resize(
image,
size=[ts0, ts1],
interpolation=self.config.resample,
antialias=True,
)

# Pad.
cs0, cs1 = canvas_size.tolist()
torch._check(cs0 >= 2)
torch._check(cs0 <= 4000)
torch._check(cs1 >= 2)
torch._check(cs1 <= 4000)
# v2.Pad takes padding as [left, top, right, bottom]; pad on the right and
# bottom so the resized image sits in the top-left corner of the canvas.
padding = [0, 0, cs1 - ts1, cs0 - ts0]
output = v2.Pad(padding=padding)(image)

# Normalize.
if self.config.normalize:
output = v2.functional.normalize(
output, self.config.image_mean, self.config.image_std
)

# Split.
tiles = TileCrop()(output, self.config.tile_size)

# Calculate aspect ratio.
aspect_ratio = canvas_size // self.config.tile_size
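# e.g. canvas_size (448, 896) with tile_size 224 -> aspect_ratio tensor([2, 4]),
# i.e. a 2 x 4 grid of tiles.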

return tiles, aspect_ratio
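
# Usage sketch for CLIPImageTransformCore (the sizes below are illustrative
# assumptions, not values taken from this file):
#
#   config = ImageTransformConfig(tile_size=224, normalize=False)
#   core = CLIPImageTransformCore(config)
#   image = torch.rand(3, 300, 800)             # [C, H, W], values in [0, 1]
#   target_size = torch.tensor([336, 896])      # aspect-preserving resize target
#   canvas_size = torch.tensor([448, 896])      # canvas the image is padded to
#   tiles, aspect_ratio = core(image, target_size, canvas_size)
#   # tiles.shape == (8, 3, 224, 224), aspect_ratio == tensor([2, 4])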


class CLIPImageTransform:
"""
This class accepts images of any size and dynamically resizes, pads, normalizes and tiles it
@@ -36,42 +36,24 @@ class CLIPImageTransform:
For example, if an input image is of size 300x800, and we want to allow
a maximum of 16 image tiles, with side 224px, then:
If ``resize_to_max_canvas=False``, then:
best_resolution = (448, 896) -> smallest canvas, up to 16 tiles, that doesn't require downscaling
If ``config.resize_to_max_canvas=False``, then:
canvas_size = (448, 896) -> smallest canvas, up to 16 tiles, that doesn't require downscaling
image is NOT resized
image is padded (300, 800) -> 448,896
Image is tiled 2x4, for a final output shape of (8, 3, 224, 224)
If ``resize_to_max_canvas=True``, then:
best_resolution = (448, 1344) # canvas that allows maximum upscaling, with minimum padding, up to 16 tiles
If ``config.resize_to_max_canvas=True``, then:
canvas_size = (448, 1344) # canvas that allows maximum upscaling, with minimum padding, up to 16 tiles
image is resized without distortion (300,800) -> (448, 1194) #448 is the limiting side for the resize
image is padded (448, 1194) -> (448, 1344)
Image is tiled 2x5, for a final output shape of (10, 3, 224, 224)
Args:
image_mean (Optional[List[float]]): Mean values of each channel, used for normalization.
Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
image_std (Optional[List[float]]): Standard deviation values of each channel, used for normalization.
Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
possible_resolutions (Optional[List[Tuple[int, int]]]): List of possible resolutions as tuples (height, width).
where each tuple represents a possible canvas to fit the image into when calling ``get_canvas_best_fit``.
If None, this will be calculated using max_num_tiles and tile_size. Default None.
tile_size (int): Size of the tiles to divide the image into. Default 224.
max_num_tiles (Optional[int]): Only used if possible_resolutions is NOT given.
Maximum number of tiles to break an image into.
This will be used to generate possible_resolutions,
e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224.
Default 4.
resample (str): Resampling method used when resizing images. Supports any enum of
``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic".
Default 'bilinear'.
resize_to_max_canvas (bool): "If True, the image will be upscaled without distortion to fit the largest possible
resolution from possible_resolutions.
If False, it will pick the resolution that minimizes downscaling, including no downscaling at all.
In this case, the image will only be upscaled if it's size < tile_size. Default False.
config (Optional[ImageTransformConfig]): config values for the image transform. If None, uses the default values.
Examples:
>>> image_transform = CLIPImageTransform(
>>>
>>> config = ImageTransformConfig(
... image_mean=None,
... image_std=None,
... tile_size=224,
Expand All @@ -80,6 +167,7 @@ class CLIPImageTransform:
... resample="bilinear",
... resize_to_max_canvas=True,
...)
>>> image_transform = CLIPImageTransform(config=config)
>>> # create random image
>>> image = (np.random.rand(100,200,3) * 255).astype(np.uint8)
>>> image = PIL.Image.fromarray(image)
@@ -92,48 +92,180 @@

def __init__(
self,
image_mean: Optional[List[float]] = None,
image_std: Optional[List[float]] = None,
possible_resolutions: Optional[List[Tuple[int, int]]] = None,
tile_size: int = 224,
max_num_tiles: Optional[int] = 4,
resample: str = "bilinear",
resize_to_max_canvas: bool = False,
config: Optional[ImageTransformConfig] = None,
) -> None:
if config is None:
config = ImageTransformConfig()

# get_canvas_best_fit
assert (
possible_resolutions is not None or max_num_tiles is not None
), f"Either possible_resolutions or max_num_tiles must be given. Got {possible_resolutions=} and {max_num_tiles=}"
config.possible_resolutions is not None or config.max_num_tiles is not None
), (
"Either possible_resolutions or max_num_tiles must be given."
+ f"Got {config.possible_resolutions=} and {config.max_num_tiles=}"
)

# If possible_resolutions are not given, then calculate possible ones based on max_num_tiles
if not possible_resolutions and max_num_tiles:
if not config.possible_resolutions and config.max_num_tiles:
possible_resolutions = find_supported_resolutions(
max_num_tiles=max_num_tiles, tile_size=tile_size
max_num_tiles=config.max_num_tiles, tile_size=config.tile_size
)
config.possible_resolutions = torch.tensor(possible_resolutions).reshape(
-1, 2
)
else:
possible_resolutions = possible_resolutions

self.possible_resolutions = torch.tensor(possible_resolutions).reshape(-1, 2)
logger.info(
f"Found possible_resolutions: {self.possible_resolutions}. Will fit the images into the canvas with best fit."
f"Found possible_resolutions: {config.possible_resolutions}. Will fit the images into the canvas with best fit."
)

self.resize_to_max_canvas = resize_to_max_canvas

# normalize
assert (image_mean is None) == (
image_std is None
), f"Need to provide both or none of image_mean and image_std. Got {image_mean=} and {image_std=}"
self.image_mean = image_mean
self.image_std = image_std
assert (config.image_mean is None) == (
config.image_std is None
), f"Need to provide both or none of image_mean and image_std. Got {config.image_mean=} and {config.image_std=}"

# resize_with_pad
self.max_upscaling_size = None if resize_to_max_canvas else tile_size
self.resample = torchvision.transforms.InterpolationMode[resample.upper()]
config.max_upscaling_size = (
None if config.resize_to_max_canvas else config.tile_size
)

# tile_crop
self.tile_size = tile_size
self.config = config

def __call__(self, *, image: Image.Image, **kwargs) -> Mapping[str, Any]:

@@ -144,35 +144,49 @@ def __call__(self, *, image: Image.Image, **kwargs) -> Mapping[str, Any]:
F.grayscale_to_rgb_image(F.to_image(image)), scale=True
)

# Find the best canvas to fit the image without distortion
best_resolution = get_canvas_best_fit(
# Find the best canvas to fit the image without distortion.
# This finds the resolution of the best tile arrangement given the
# image dimensions, aspect ratio, and config.max_num_tiles.
canvas_size = get_canvas_best_fit(
image=image_tensor,
possible_resolutions=self.possible_resolutions,
resize_to_max_canvas=self.resize_to_max_canvas,
possible_resolutions=self.config.possible_resolutions,
resize_to_max_canvas=self.config.resize_to_max_canvas,
)

# resize without distortion + pad to fit best_resolution
image_tensor = resize_with_pad(
image=image_tensor,
target_size=best_resolution,
resample=self.resample,
max_upscaling_size=self.max_upscaling_size,
)

# Normalize
if self.image_mean and self.image_std:
image_tensor = F.normalize(
image_tensor, mean=self.image_mean, std=self.image_std
# Find the target size for the image. The canvas may have a different
# aspect ratio to the image; find the largest image that can fit within the
# canvas size while preserving aspect ratio. The resized image will later be
# padded to canvas size.
image_height, image_width = image_tensor.shape[-2:]
# If target_size requires upscaling, we might want to limit the upscaling to max_upscaling_size.
if self.config.max_upscaling_size is not None:
target_height = min(
max(image_height, self.config.max_upscaling_size), canvas_size[0]
)
target_width = min(
max(image_width, self.config.max_upscaling_size), canvas_size[1]
)
target_size = (target_height, target_width)
else:
target_size = canvas_size

# Target size that preserves aspect ratio.
scale_h = target_size[0] / image_height
scale_w = target_size[1] / image_width

# Divide the image into equally sized tiles
image_tensor = tile_crop(image=image_tensor, tile_size=self.tile_size)
new_target_height = min(math.floor(image_height * scale_w), target_size[0])
new_target_width = min(math.floor(image_width * scale_h), target_size[1])
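# Worked example (from the class docstring): image (300, 800) with
# resize_to_max_canvas=True gives target_size = canvas_size = (448, 1344):
#   scale_h = 448 / 300 ≈ 1.493, scale_w = 1344 / 800 = 1.68
#   new_target_height = min(floor(300 * 1.68), 448) = 448
#   new_target_width = min(floor(800 * 1.493), 1344) = 1194
# Cross-applying the scales and taking the min means the smaller scale wins in
# both dimensions, so the resized image keeps its original aspect ratio.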

aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size
# Call CLIPImageTransformCore to perform the resize, pad, and split transforms.
tiles, aspect_ratio = CLIPImageTransformCore(self.config)(
image=image_tensor,
target_size=torch.tensor([new_target_height, new_target_width]),
canvas_size=torch.tensor([canvas_size[0], canvas_size[1]]),
)

kwargs.update(
{
"image": image_tensor,
"image": tiles,
"aspect_ratio": aspect_ratio,
}
)