axially-agnostic pixdim scaling for medical files that require it
JBWilkie committed Jan 9, 2025
1 parent 90c3fc4 commit 1c26b7f
Showing 6 changed files with 257 additions and 73 deletions.
18 changes: 0 additions & 18 deletions darwin/dataset/remote_dataset.py
@@ -996,24 +996,6 @@ def import_annotation(self, item_id: ItemId, payload: Dict[str, Any]) -> None:
"""
...

@abstractmethod
def _get_remote_files_that_require_legacy_scaling(self) -> List[Path]:
"""
Get all remote files that have been scaled upon upload. These files require that
NifTI annotations are similarly scaled during import
Parameters
----------
dataset : RemoteDataset
The remote dataset to get the files from
Returns
-------
List[Path]
A list of full remote paths of dataset items that require NifTI annotations to be scaled
"""
...

@property
def remote_path(self) -> Path:
"""Returns an URL specifying the location of the remote dataset."""
46 changes: 0 additions & 46 deletions darwin/dataset/remote_dataset_v2.py
@@ -11,7 +11,6 @@
Tuple,
Union,
)
import numpy as np
from pydantic import ValidationError
from requests.models import Response

@@ -873,51 +872,6 @@ def register_multi_slotted(
print(f"Reistration complete. Check your items in the dataset: {self.slug}")
return results

def _get_remote_files_that_require_legacy_scaling(
self,
) -> Dict[str, Dict[str, Any]]:
"""
Get all remote files that have been scaled upon upload. These files require that
NifTI annotations are similarly scaled during import.
The in-platform affines are returned for each legacy file, as this is required
to properly re-orient the annotations during import.
Parameters
----------
dataset : RemoteDataset
The remote dataset to get the files from
Returns
-------
Dict[str, Dict[str, Any]]
A dictionary of remote file full paths to their slot affine maps
"""
remote_files_that_require_legacy_scaling = {}
remote_files = self.fetch_remote_files(
filters={"statuses": ["new", "annotate", "review", "complete", "archived"]}
)
for remote_file in remote_files:
if not remote_file.slots[0].get("metadata", {}).get("medical", {}):
continue
if not (
remote_file.slots[0]
.get("metadata", {})
.get("medical", {})
.get("handler")
):
slot_affine_map = {}
for slot in remote_file.slots:
slot_affine_map[slot["slot_name"]] = np.array(
slot["metadata"]["medical"]["affine"],
dtype=np.float64,
)
remote_files_that_require_legacy_scaling[
Path(remote_file.full_path)
] = slot_affine_map

return remote_files_that_require_legacy_scaling


def _find_files_to_upload_as_multi_file_items(
search_files: List[PathLike],
57 changes: 57 additions & 0 deletions darwin/datatypes.py
@@ -244,6 +244,63 @@ def get_sub(self, annotation_type: str) -> Optional[SubAnnotation]:
return sub
return None

def scale_coordinates(self, x_scale: float, y_scale: float) -> None:
"""
Multiplies the coordinates of the annotation by the given values.
Parameters
----------
x_scale : float
Scale factor for x coordinates
y_scale : float
Scale factor for y coordinates
"""
if (
getattr(self, "annotation_class", None)
and self.annotation_class.name == "__raster_layer__"
):
return

annotation_type = (
self.annotation_class.annotation_type
if hasattr(self, "annotation_class")
else None
)
if not annotation_type:
return

if annotation_type == "bounding_box":
self.data["x"] *= x_scale
self.data["y"] *= y_scale
self.data["w"] *= x_scale
self.data["h"] *= y_scale

elif annotation_type == "polygon":
for path in self.data["paths"]:
for point in path:
point["x"] *= x_scale
point["y"] *= y_scale

elif annotation_type == "ellipse":
self.data["center"]["x"] *= x_scale
self.data["center"]["y"] *= y_scale
self.data["radius"]["x"] *= x_scale
self.data["radius"]["y"] *= y_scale

elif annotation_type == "line":
for point in self.data["path"]:
point["x"] *= x_scale
point["y"] *= y_scale

elif annotation_type == "keypoint":
self.data["x"] *= x_scale
self.data["y"] *= y_scale

elif annotation_type == "skeleton":
for node in self.data["nodes"]:
node["x"] *= x_scale
node["y"] *= y_scale


@dataclass(frozen=False, eq=True)
class VideoAnnotation:
186 changes: 179 additions & 7 deletions darwin/importer/importer.py
@@ -20,7 +20,7 @@
Tuple,
Union,
)

import numpy as np

from darwin.datatypes import (
AnnotationFile,
@@ -117,7 +117,9 @@ def _find_and_parse( # noqa: C901
console: Optional[Console] = None,
use_multi_cpu: bool = True,
cpu_limit: int = 1,
remote_files_that_require_legacy_scaling: Optional[List[Path]] = None,
remote_files_that_require_legacy_scaling: Optional[
Dict[str, Dict[str, Any]]
] = None,
) -> Optional[Iterable[dt.AnnotationFile]]:
is_console = console is not None

@@ -1252,21 +1254,34 @@ def import_annotations( # noqa: C901
console.print("Retrieving local annotations ...", style="info")
local_files = []
local_files_missing_remotely = []

remote_files_targeted_by_import = _get_remote_files_targeted_by_import(
importer, file_paths, dataset, console, use_multi_cpu, cpu_limit
)
(
remote_files_that_require_legacy_nifti_scaling,
remote_files_that_require_pixel_to_mm_transform,
) = _get_remote_medical_file_transform_requirements(remote_files_targeted_by_import)

if importer.__module__ == "darwin.importer.formats.nifti":
remote_files_that_require_legacy_scaling = (
dataset._get_remote_files_that_require_legacy_scaling()
)
maybe_parsed_files: Optional[Iterable[dt.AnnotationFile]] = _find_and_parse(
importer,
file_paths,
console,
use_multi_cpu,
cpu_limit,
remote_files_that_require_legacy_scaling,
remote_files_that_require_legacy_nifti_scaling,
)
else:
maybe_parsed_files: Optional[Iterable[dt.AnnotationFile]] = _find_and_parse(
importer, file_paths, console, use_multi_cpu, cpu_limit
importer,
file_paths,
console,
use_multi_cpu,
cpu_limit,
)
maybe_parsed_files = _scale_coordinates_by_pixdims(
maybe_parsed_files, remote_files_that_require_pixel_to_mm_transform
)

if not maybe_parsed_files:
@@ -2312,3 +2327,160 @@ def _split_payloads(
payloads.append(current_payload)

return payloads


def _get_remote_files_targeted_by_import(
importer: Callable[[Path], Union[List[dt.AnnotationFile], dt.AnnotationFile, None]],
file_paths: List[PathLike],
dataset: "RemoteDataset",
console: Optional[Console] = None,
use_multi_cpu: bool = True,
cpu_limit: int = 1,
) -> List[DatasetItem]:
"""
Parses local annotation files for import and returns a list of remote dataset items
targeted by the import. Requests are chunked when many files are targeted, to avoid
URL length issues.
Parameters
----------
importer: Callable[[Path], Union[List[dt.AnnotationFile], dt.AnnotationFile, None]]
The importer used to parse local annotation files
file_paths: List[PathLike]
A list of local annotation files to be uploaded
dataset: RemoteDataset
The remote dataset to fetch files from
console: Optional[Console]
The console object
use_multi_cpu: bool
Whether to use multi-CPU processing
cpu_limit: int
The number of CPUs to use for processing
Returns
-------
List[DatasetItem]
A list of remote dataset items targeted by the import
Raises
------
ValueError
If no files could be parsed or if the URL becomes too long even with minimum chunk size
"""
maybe_parsed_files = _find_and_parse(
importer, file_paths, console, use_multi_cpu, cpu_limit
)
if not maybe_parsed_files:
raise ValueError("Not able to parse any files.")

remote_filenames = list({file.filename for file in maybe_parsed_files})
remote_filepaths = [file.full_path for file in maybe_parsed_files]

chunk_size = 100
all_remote_files: List[DatasetItem] = []
while chunk_size > 0:
try:
for i in range(0, len(remote_filenames), chunk_size):
chunk = remote_filenames[i : i + chunk_size]
remote_files = dataset.fetch_remote_files(filters={"item_names": chunk})
all_remote_files.extend(remote_files)
break
except RequestEntitySizeExceeded:
chunk_size -= 8
if chunk_size <= 0:
raise ValueError(
"Unable to fetch remote file list - URL too long even with minimum chunk size."
)
return [
remote_file
for remote_file in all_remote_files
if remote_file.full_path in remote_filepaths
]
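A standalone sketch of the chunked-fetch back-off used above, with a hypothetical fetch callable and a stand-in exception class; unlike the method above, this sketch resets the accumulated results on each retry:

from typing import Callable, Dict, List


class RequestEntitySizeExceeded(Exception):
    """Stand-in for the exception raised when the request URL is too long."""


def fetch_in_chunks(
    names: List[str],
    fetch: Callable[[List[str]], List[Dict]],
    chunk_size: int = 100,
) -> List[Dict]:
    # Shrink the chunk size until every request URL fits within the server limit.
    while chunk_size > 0:
        results: List[Dict] = []
        try:
            for i in range(0, len(names), chunk_size):
                results.extend(fetch(names[i : i + chunk_size]))
            return results
        except RequestEntitySizeExceeded:
            chunk_size -= 8
    raise ValueError(
        "Unable to fetch remote file list - URL too long even with minimum chunk size."
    )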


def _get_remote_medical_file_transform_requirements(
remote_files_targeted_by_import: List[DatasetItem],
) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, List[str]]]:
"""
This function inspects the remote files targeted by the import. For each medical file,
it checks whether the file requires legacy NifTI scaling or a pixel-to-mm transform.
If the file requires a pixel-to-mm transform, it selects the correct pixdim values
based on the axis of acquisition.
Parameters
----------
remote_files_targeted_by_import: List[DatasetItem]
The remote files targeted by the import
Returns
-------
Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, List[float]]]]
A tuple of 2 dictionaries:
- remote_files_that_require_legacy_nifti_scaling: A dictionary of remote files
that require legacy NifTI scaling and the slot name to affine matrix mapping
- remote_files_that_require_pixel_to_mm_transform: A dictionary of remote files
that require a pixel to mm transform and the pixdims of the (x, y) axes
"""
remote_files_that_require_legacy_nifti_scaling = {}
remote_files_that_require_pixel_to_mm_transform = {}
for remote_file in remote_files_targeted_by_import:
if not remote_file.is_medical:
continue
if remote_file.is_handled_by_monai:
slot_pixdim_map = {}
for slot in remote_file.slots:
slot_name = slot["slot_name"]
primary_plane = slot["metadata"]["medical"]["plane_map"][slot_name]
pixdims = slot["metadata"]["medical"]["pixdims"]
if primary_plane == "AXIAL":
pixdims = [pixdims[0], pixdims[1]]
elif primary_plane == "SAGGITAL":
pixdims = [pixdims[0], pixdims[2]]
elif primary_plane == "CORONAL":
pixdims = [pixdims[1], pixdims[2]]
slot_pixdim_map[slot_name] = pixdims
remote_files_that_require_pixel_to_mm_transform[remote_file.full_path] = (
slot_pixdim_map
)
else:
slot_affine_map = {}
for slot in remote_file.slots:
slot_affine_map[slot["slot_name"]] = np.array(
slot["metadata"]["medical"]["affine"],
dtype=np.float64,
)
remote_files_that_require_legacy_nifti_scaling[remote_file.full_path] = (
slot_affine_map
)

return (
remote_files_that_require_legacy_nifti_scaling,
remote_files_that_require_pixel_to_mm_transform,
)
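A small standalone sketch of the plane-to-pixdim selection performed above; the plane names (including the "SAGGITAL" spelling) follow the branches in the function, while the pixdim values are hypothetical:

from typing import Dict, List

# Map each primary plane to the indices of the in-plane (x, y) pixdims.
_PLANE_TO_PIXDIM_INDICES: Dict[str, List[int]] = {
    "AXIAL": [0, 1],
    "SAGGITAL": [0, 2],
    "CORONAL": [1, 2],
}


def in_plane_pixdims(primary_plane: str, pixdims: List[float]) -> List[float]:
    i, j = _PLANE_TO_PIXDIM_INDICES[primary_plane]
    return [pixdims[i], pixdims[j]]


# Hypothetical values: 0.5 mm x 0.6 mm in-plane, 3.0 mm slice spacing.
assert in_plane_pixdims("AXIAL", [0.5, 0.6, 3.0]) == [0.5, 0.6]
assert in_plane_pixdims("SAGGITAL", [0.5, 0.6, 3.0]) == [0.5, 3.0]
assert in_plane_pixdims("CORONAL", [0.5, 0.6, 3.0]) == [0.6, 3.0]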


def _scale_coordinates_by_pixdims(
maybe_parsed_files: List[dt.AnnotationFile],
remote_files_that_require_pixel_to_mm_transform: Dict[Path, Any],
) -> List[dt.AnnotationFile]:
"""
This function scales coordinates by the pixdims of the (x, y) axes.
"""
if not remote_files_that_require_pixel_to_mm_transform:
return maybe_parsed_files
for file in maybe_parsed_files:
if file.full_path in remote_files_that_require_pixel_to_mm_transform:
for annotation in file.annotations:
slot_name = annotation.slot_names[0]
pixdims = remote_files_that_require_pixel_to_mm_transform[
file.full_path
][slot_name]
if isinstance(annotation, dt.VideoAnnotation):
for frame_idx, frame_annotation in annotation.frames.items():
frame_annotation.scale_coordinates(
float(pixdims[0]), float(pixdims[1])
)
elif isinstance(annotation, dt.Annotation):
annotation.scale_coordinates(float(pixdims[0]), float(pixdims[1]))
return maybe_parsed_files
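The scaling step keys into the transform mapping by the annotation file's full remote path, then by the annotation's first slot name. A sketch of that lookup with a hypothetical path and pixdims:

# Hypothetical output of _get_remote_medical_file_transform_requirements:
# remote full path -> slot name -> in-plane (x, y) pixdims in millimetres per pixel.
pixel_to_mm_transforms = {
    "/scans/chest_ct.nii.gz": {"0": [0.5, 0.6]},
}

full_path = "/scans/chest_ct.nii.gz"
slot_name = "0"
if full_path in pixel_to_mm_transforms:
    x_scale, y_scale = pixel_to_mm_transforms[full_path][slot_name]
    # A keypoint at (100 px, 200 px) becomes (50.0 mm, 120.0 mm).
    point_px = {"x": 100.0, "y": 200.0}
    point_mm = {"x": point_px["x"] * x_scale, "y": point_px["y"] * y_scale}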
19 changes: 19 additions & 0 deletions darwin/item.py
@@ -63,6 +63,25 @@ def full_path(self) -> str:
"""
return construct_full_path(self.path, self.filename)

@property
def is_medical(self) -> bool:
"""
Whether this ``DatasetItem`` is a medical file or not.
"""
return self.slots[0].get("metadata", {}).get("medical", {}) is not None

@property
def is_handled_by_monai(self) -> bool:
"""
Whether this medical ``DatasetItem`` is handled by MONAI or not.
"""
if not self.is_medical:
return False
return (
self.slots[0].get("metadata", {}).get("medical", {}).get("handler")
== "MONAI"
)
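
For reference, an illustrative slot payload showing only the keys that is_medical, is_handled_by_monai, and the importer changes above read; all values are hypothetical:

slot = {
    "slot_name": "0",
    "metadata": {
        "medical": {
            # "MONAI" marks files handled by the new pixel-to-mm path; medical
            # files without a "MONAI" handler are scaled via the affine instead.
            "handler": "MONAI",
            "plane_map": {"0": "AXIAL"},
            "pixdims": [0.5, 0.6, 3.0],
            "affine": [
                [0.5, 0.0, 0.0, 0.0],
                [0.0, 0.6, 0.0, 0.0],
                [0.0, 0.0, 3.0, 0.0],
                [0.0, 0.0, 0.0, 1.0],
            ],
        }
    },
}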

@classmethod
def parse(cls, raw: Dict[str, Any], dataset_slug: str = "n/a") -> "DatasetItem":
"""