diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py
index 303cb884d..fcfb3cf8d 100644
--- a/darwin/dataset/remote_dataset.py
+++ b/darwin/dataset/remote_dataset.py
@@ -996,24 +996,6 @@ def import_annotation(self, item_id: ItemId, payload: Dict[str, Any]) -> None:
         """
         ...
 
-    @abstractmethod
-    def _get_remote_files_that_require_legacy_scaling(self) -> List[Path]:
-        """
-        Get all remote files that have been scaled upon upload. These files require that
-        NifTI annotations are similarly scaled during import
-
-        Parameters
-        ----------
-        dataset : RemoteDataset
-            The remote dataset to get the files from
-
-        Returns
-        -------
-        List[Path]
-            A list of full remote paths of dataset items that require NifTI annotations to be scaled
-        """
-        ...
-
     @property
     def remote_path(self) -> Path:
         """Returns an URL specifying the location of the remote dataset."""
diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py
index 8956a8a3a..e4b56b22a 100644
--- a/darwin/dataset/remote_dataset_v2.py
+++ b/darwin/dataset/remote_dataset_v2.py
@@ -11,7 +11,6 @@
     Tuple,
     Union,
 )
-import numpy as np
 
 from pydantic import ValidationError
 from requests.models import Response
@@ -873,51 +872,6 @@ def register_multi_slotted(
         print(f"Reistration complete. Check your items in the dataset: {self.slug}")
         return results
 
-    def _get_remote_files_that_require_legacy_scaling(
-        self,
-    ) -> Dict[str, Dict[str, Any]]:
-        """
-        Get all remote files that have been scaled upon upload. These files require that
-        NifTI annotations are similarly scaled during import.
-
-        The in-platform affines are returned for each legacy file, as this is required
-        to properly re-orient the annotations during import.
-
-        Parameters
-        ----------
-        dataset : RemoteDataset
-            The remote dataset to get the files from
-
-        Returns
-        -------
-        Dict[str, Dict[str, Any]]
-            A dictionary of remote file full paths to their slot affine maps
-        """
-        remote_files_that_require_legacy_scaling = {}
-        remote_files = self.fetch_remote_files(
-            filters={"statuses": ["new", "annotate", "review", "complete", "archived"]}
-        )
-        for remote_file in remote_files:
-            if not remote_file.slots[0].get("metadata", {}).get("medical", {}):
-                continue
-            if not (
-                remote_file.slots[0]
-                .get("metadata", {})
-                .get("medical", {})
-                .get("handler")
-            ):
-                slot_affine_map = {}
-                for slot in remote_file.slots:
-                    slot_affine_map[slot["slot_name"]] = np.array(
-                        slot["metadata"]["medical"]["affine"],
-                        dtype=np.float64,
-                    )
-                remote_files_that_require_legacy_scaling[
-                    Path(remote_file.full_path)
-                ] = slot_affine_map
-
-        return remote_files_that_require_legacy_scaling
-
 
 def _find_files_to_upload_as_multi_file_items(
     search_files: List[PathLike],
diff --git a/darwin/datatypes.py b/darwin/datatypes.py
index 189a6ce94..c0b080fad 100644
--- a/darwin/datatypes.py
+++ b/darwin/datatypes.py
@@ -244,6 +244,63 @@ def get_sub(self, annotation_type: str) -> Optional[SubAnnotation]:
                 return sub
         return None
 
+    def scale_coordinates(self, x_scale: float, y_scale: float) -> None:
+        """
+        Multiplies the coordinates of the annotation by the given values.
+
+        Parameters
+        ----------
+        x_scale : float
+            Scale factor for x coordinates
+        y_scale : float
+            Scale factor for y coordinates
+        """
+        if (
+            getattr(self, "annotation_class", None)
+            and self.annotation_class.name == "__raster_layer__"
+        ):
+            return
+
+        annotation_type = (
+            self.annotation_class.annotation_type
+            if hasattr(self, "annotation_class")
+            else None
+        )
+        if not annotation_type:
+            return
+
+        if annotation_type == "bounding_box":
+            self.data["x"] *= x_scale
+            self.data["y"] *= y_scale
+            self.data["w"] *= x_scale
+            self.data["h"] *= y_scale
+
+        elif annotation_type == "polygon":
+            for path in self.data["paths"]:
+                for point in path:
+                    point["x"] *= x_scale
+                    point["y"] *= y_scale
+
+        elif annotation_type == "ellipse":
+            self.data["center"]["x"] *= x_scale
+            self.data["center"]["y"] *= y_scale
+            self.data["radius"]["x"] *= x_scale
+            self.data["radius"]["y"] *= y_scale
+
+        elif annotation_type == "line":
+            for point in self.data["path"]:
+                point["x"] *= x_scale
+                point["y"] *= y_scale
+
+        elif annotation_type == "keypoint":
+            self.data["x"] *= x_scale
+            self.data["y"] *= y_scale
+
+        elif annotation_type == "skeleton":
+            for node in self.data["nodes"]:
+                node["x"] *= x_scale
+                node["y"] *= y_scale
+
 
 @dataclass(frozen=False, eq=True)
 class VideoAnnotation:
diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py
index 0d4764b84..6370bdd39 100644
--- a/darwin/importer/importer.py
+++ b/darwin/importer/importer.py
@@ -20,7 +20,7 @@
     Tuple,
     Union,
 )
-
+import numpy as np
 
 from darwin.datatypes import (
     AnnotationFile,
@@ -117,7 +117,9 @@ def _find_and_parse(  # noqa: C901
     console: Optional[Console] = None,
     use_multi_cpu: bool = True,
     cpu_limit: int = 1,
-    remote_files_that_require_legacy_scaling: Optional[List[Path]] = None,
+    remote_files_that_require_legacy_scaling: Optional[
+        Dict[str, Dict[str, Any]]
+    ] = None,
 ) -> Optional[Iterable[dt.AnnotationFile]]:
     is_console = console is not None
 
@@ -1252,21 +1254,34 @@ def import_annotations(  # noqa: C901
         console.print("Retrieving local annotations ...", style="info")
     local_files = []
     local_files_missing_remotely = []
+
+    remote_files_targeted_by_import = _get_remote_files_targeted_by_import(
+        importer, file_paths, dataset, console, use_multi_cpu, cpu_limit
+    )
+    (
+        remote_files_that_require_legacy_nifti_scaling,
+        remote_files_that_require_pixel_to_mm_transform,
+    ) = _get_remote_medical_file_transform_requirements(remote_files_targeted_by_import)
+
     if importer.__module__ == "darwin.importer.formats.nifti":
-        remote_files_that_require_legacy_scaling = (
-            dataset._get_remote_files_that_require_legacy_scaling()
-        )
         maybe_parsed_files: Optional[Iterable[dt.AnnotationFile]] = _find_and_parse(
             importer,
             file_paths,
             console,
             use_multi_cpu,
             cpu_limit,
-            remote_files_that_require_legacy_scaling,
+            remote_files_that_require_legacy_nifti_scaling,
         )
     else:
         maybe_parsed_files: Optional[Iterable[dt.AnnotationFile]] = _find_and_parse(
-            importer, file_paths, console, use_multi_cpu, cpu_limit
+            importer,
+            file_paths,
+            console,
+            use_multi_cpu,
+            cpu_limit,
+        )
+        maybe_parsed_files = _scale_coordinates_by_pixdims(
+            maybe_parsed_files, remote_files_that_require_pixel_to_mm_transform
         )
 
     if not maybe_parsed_files:
@@ -2312,3 +2327,160 @@ def _split_payloads(
         payloads.append(current_payload)
 
     return payloads
+
+
+def _get_remote_files_targeted_by_import(
+    importer: Callable[[Path], Union[List[dt.AnnotationFile], dt.AnnotationFile, None]],
+    file_paths: List[PathLike],
+    dataset: "RemoteDataset",
+    console: Optional[Console] = None,
+    use_multi_cpu: bool = True,
+    cpu_limit: int = 1,
+) -> List[DatasetItem]:
+    """
+    Parses local annotation files for import and returns a list of remote dataset items
+    targeted by the import. Handles chunking of requests if there are many files to
+    avoid URL length issues.
+
+    Parameters
+    ----------
+    importer: Callable[[Path], Union[List[dt.AnnotationFile], dt.AnnotationFile, None]]
+        The importer used to parse local annotation files
+    file_paths: List[PathLike]
+        A list of local annotation files to be uploaded
+    dataset: RemoteDataset
+        The remote dataset to fetch files from
+    console: Optional[Console]
+        The console object
+    use_multi_cpu: bool
+        Whether to use multi-CPU processing
+    cpu_limit: int
+        The number of CPUs to use for processing
+
+    Returns
+    -------
+    List[DatasetItem]
+        A list of remote dataset items targeted by the import
+
+    Raises
+    ------
+    ValueError
+        If no files could be parsed or if the URL becomes too long even with minimum chunk size
+    """
+    maybe_parsed_files = _find_and_parse(
+        importer, file_paths, console, use_multi_cpu, cpu_limit
+    )
+    if not maybe_parsed_files:
+        raise ValueError("Not able to parse any files.")
+
+    remote_filenames = list({file.filename for file in maybe_parsed_files})
+    remote_filepaths = [file.full_path for file in maybe_parsed_files]
+
+    chunk_size = 100
+    all_remote_files: List[DatasetItem] = []
+    while chunk_size > 0:
+        try:
+            for i in range(0, len(remote_filenames), chunk_size):
+                chunk = remote_filenames[i : i + chunk_size]
+                remote_files = dataset.fetch_remote_files(filters={"item_names": chunk})
+                all_remote_files.extend(remote_files)
+            break
+        except RequestEntitySizeExceeded:
+            chunk_size -= 8
+            if chunk_size <= 0:
+                raise ValueError(
+                    "Unable to fetch remote file list - URL too long even with minimum chunk size."
+                )
+    return [
+        remote_file
+        for remote_file in all_remote_files
+        if remote_file.full_path in remote_filepaths
+    ]
+
+
+def _get_remote_medical_file_transform_requirements(
+    remote_files_targeted_by_import: List[DatasetItem],
+) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, List[str]]]:
+    """
+    This function parses the remote files targeted by the import. If the remote file is
+    a medical file, it checks if it requires legacy NifTI scaling or a pixel to mm transform.
+
+    If the file requires a pixel to mm transform, it selects the correct pixdim values
+    based on the axis of acquisition.
+
+    Parameters
+    ----------
+    remote_files_targeted_by_import: List[DatasetItem]
+        The remote files targeted by the import
+
+    Returns
+    -------
+    Tuple[Dict[str, Dict[str, Any]], Dict[str, List[str]]]
+        A tuple of 2 dictionaries:
+        - remote_files_that_require_legacy_nifti_scaling: A dictionary of remote files
+          that require legacy NifTI scaling and the slot name to affine matrix mapping
+        - remote_files_that_require_pixel_to_mm_transform: A dictionary of remote files
+          that require a pixel to mm transform and the pixdims of the (x, y) axes
+    """
+    remote_files_that_require_legacy_nifti_scaling = {}
+    remote_files_that_require_pixel_to_mm_transform = {}
+    for remote_file in remote_files_targeted_by_import:
+        if not remote_file.is_medical:
+            continue
+        if remote_file.is_handled_by_monai:
+            slot_pixdim_map = {}
+            for slot in remote_file.slots:
+                slot_name = slot["slot_name"]
+                primary_plane = slot["metadata"]["medical"]["plane_map"][slot_name]
+                pixdims = slot["metadata"]["medical"]["pixdims"]
+                if primary_plane == "AXIAL":
+                    pixdims = [pixdims[0], pixdims[1]]
+                elif primary_plane == "SAGGITAL":
+                    pixdims = [pixdims[0], pixdims[2]]
+                elif primary_plane == "CORONAL":
+                    pixdims = [pixdims[1], pixdims[2]]
+                slot_pixdim_map[slot_name] = pixdims
+            remote_files_that_require_pixel_to_mm_transform[remote_file.full_path] = (
+                slot_pixdim_map
+            )
+        else:
+            slot_affine_map = {}
+            for slot in remote_file.slots:
+                slot_affine_map[slot["slot_name"]] = np.array(
+                    slot["metadata"]["medical"]["affine"],
+                    dtype=np.float64,
+                )
+            remote_files_that_require_legacy_nifti_scaling[remote_file.full_path] = (
+                slot_affine_map
+            )
+
+    return (
+        remote_files_that_require_legacy_nifti_scaling,
+        remote_files_that_require_pixel_to_mm_transform,
+    )
+
+
+def _scale_coordinates_by_pixdims(
+    maybe_parsed_files: List[dt.AnnotationFile],
+    remote_files_that_require_pixel_to_mm_transform: Dict[str, Any],
+) -> List[dt.AnnotationFile]:
+    """
+    This function scales coordinates by the pixdims of the (x, y) axes.
+    """
+    if not remote_files_that_require_pixel_to_mm_transform:
+        return maybe_parsed_files
+    for file in maybe_parsed_files:
+        if file.full_path in remote_files_that_require_pixel_to_mm_transform:
+            for annotation in file.annotations:
+                slot_name = annotation.slot_names[0]
+                pixdims = remote_files_that_require_pixel_to_mm_transform[
+                    file.full_path
+                ][slot_name]
+                if isinstance(annotation, dt.VideoAnnotation):
+                    for frame_idx, frame_annotation in annotation.frames.items():
+                        frame_annotation.scale_coordinates(
+                            float(pixdims[0]), float(pixdims[1])
+                        )
+                elif isinstance(annotation, dt.Annotation):
+                    annotation.scale_coordinates(float(pixdims[0]), float(pixdims[1]))
+    return maybe_parsed_files
diff --git a/darwin/item.py b/darwin/item.py
index 9691cf6cc..9878fd1ac 100644
--- a/darwin/item.py
+++ b/darwin/item.py
@@ -63,6 +63,25 @@ def full_path(self) -> str:
         """
         return construct_full_path(self.path, self.filename)
 
+    @property
+    def is_medical(self) -> bool:
+        """
+        Whether this ``DatasetItem`` is a medical file or not.
+        """
+        return self.slots[0].get("metadata", {}).get("medical") is not None
+
+    @property
+    def is_handled_by_monai(self) -> bool:
+        """
+        Whether this medical ``DatasetItem`` is handled by MONAI or not.
+        """
+        if not self.is_medical:
+            return False
+        return (
+            self.slots[0].get("metadata", {}).get("medical", {}).get("handler")
+            == "MONAI"
+        )
+
     @classmethod
     def parse(cls, raw: Dict[str, Any], dataset_slug: str = "n/a") -> "DatasetItem":
         """
diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py
index 0fb12d703..e134fbb85 100644
--- a/tests/darwin/dataset/remote_dataset_test.py
+++ b/tests/darwin/dataset/remote_dataset_test.py
@@ -1950,7 +1950,7 @@ def mock_remote_files(self):
         ]
 
     @patch.object(RemoteDatasetV2, "fetch_remote_files")
-    def test_get_remote_files_that_require_legacy_scaling(
+    def test_get_remote_files_that_require_legacy_nifti_scaling(
        self, mock_fetch_remote_files, mock_remote_files
    ):
        mock_fetch_remote_files.return_value = mock_remote_files
@@ -1962,7 +1962,7 @@ def test_get_remote_files_that_require_legacy_scaling(
            dataset_id=1,
        )

-        result = remote_dataset._get_remote_files_that_require_legacy_scaling()
+        result = remote_dataset._get_remote_files_that_require_legacy_nifti_scaling()

        assert Path("/path/to/file/filename") in result
        np.testing.assert_array_equal(
            result[Path("/path/to/file/filename")]["0"], np.array([[-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])  # type: ignore
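Reviewer note: below is a minimal, illustrative sketch of how the new Annotation.scale_coordinates method is expected to behave in the pixel-to-mm case. It assumes the existing darwin.datatypes.make_bounding_box helper and uses made-up class name and pixdim values; it is not part of the change itself. VideoAnnotation frames go through the same method once per frame annotation.

# Illustrative sketch only, not part of the diff above.
# Assumes darwin.datatypes.make_bounding_box (existing helper); the class name
# "lesion" and the pixdim values are invented for the example.
from darwin import datatypes as dt

bbox = dt.make_bounding_box("lesion", x=10.0, y=20.0, w=30.0, h=40.0)

# Pretend the slot's primary plane is AXIAL with 0.5 mm x 0.5 mm pixel spacing,
# i.e. the per-slot pixdims selected by _get_remote_medical_file_transform_requirements.
pixdims = [0.5, 0.5]

bbox.scale_coordinates(float(pixdims[0]), float(pixdims[1]))
assert bbox.data["x"] == 5.0 and bbox.data["y"] == 10.0
assert bbox.data["w"] == 15.0 and bbox.data["h"] == 20.0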