Skip to content

Commit

Permalink
[DAR-3041][External] Change the behaviour of the force_slots argume…
Browse files Browse the repository at this point in the history
…nt of `pull()` (#915)

* Test commit

* Don't download single-slotted items as multi-slotted items when force_slots=False

* Unit tests + simpler approach

* GHA permissions
  • Loading branch information
JBWilkie authored Aug 22, 2024
1 parent 477c09e commit ad35478
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 12 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/EVENT_merge_to_master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:

permissions:
contents: read
id-token: write # Necessary for the generate documentation job

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand All @@ -24,6 +25,9 @@ jobs:
name: Documentation
uses: ./.github/workflows/JOB_generate_documentation.yml
secrets: inherit
permissions:
id-token: write
contents: read

warn_on_fail:
needs: [run_tests, documentation]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/EVENT_tag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

permissions:
contents: write
contents: read

jobs:
create_release:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/JOB_typecheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ jobs:
- name: MyPy typecheck
shell: bash
run: |
pip install pydantic==2.8.2
pip install pydantic
bash ${{ github.workspace }}/deploy/format_lint.sh typecheck ${{ inputs.files }}
20 changes: 11 additions & 9 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ def download_all_images_from_annotations(
Recreate folders
video_frames : bool, default: False
Pulls video frames images instead of video files
force_slots: bool
force_slots: bool, default: False
Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name})
If False, all multi-slotted items and items with slots containing multiple source files will be downloaded as the deeper file structure
Returns
-------
Expand All @@ -90,7 +91,7 @@ def download_all_images_from_annotations(
if is_file_extension_allowed(image.name)
}

annotations_to_download_path = []
annotations_to_download_path: List = []
for annotation_path in annotations_path.glob(f"*.{annotation_format}"):
annotation = parse_darwin_json(annotation_path, count=0)
if annotation is None:
Expand All @@ -108,13 +109,14 @@ def download_all_images_from_annotations(
):
continue

annotations_to_download_path.append(annotation_path)
if len(annotation.slots) > 1:
force_slots = True
if force_slots:
force_slots_for_item = True
else:
force_slots_for_item = len(annotation.slots) > 1 or any(
len(slot.source_files) > 1 for slot in annotation.slots
)

for slot in annotation.slots:
if len(slot.source_files) > 1:
force_slots = True
annotations_to_download_path.append((annotation_path, force_slots_for_item))

if remove_extra:
annotations = (
Expand All @@ -138,7 +140,7 @@ def download_all_images_from_annotations(

# Create the generator with the partial functions
download_functions: List = []
for annotation_path in annotations_to_download_path:
for annotation_path, force_slots in annotations_to_download_path:
file_download_functions = lazy_download_image_from_annotation(
api_key,
annotation_path,
Expand Down
79 changes: 78 additions & 1 deletion tests/darwin/dataset/remote_dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
from darwin.client import Client
from darwin.config import Config
from darwin.dataset import RemoteDataset
from darwin.dataset.download_manager import _download_image_from_json_annotation
from darwin.dataset.download_manager import (
_download_image_from_json_annotation,
download_all_images_from_annotations,
)
from darwin.dataset.release import Release, ReleaseStatus
from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2
from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2
Expand All @@ -25,6 +28,12 @@
from tests.fixtures import *


@pytest.fixture
def mock_is_file_extension_allowed():
with patch("darwin.dataset.download_manager.is_file_extension_allowed") as mock:
yield mock


@pytest.fixture
def annotation_name() -> str:
return "test_video.json"
Expand Down Expand Up @@ -1373,3 +1382,71 @@ def test_omits_unavailable_releases_when_retry_is_false(
releases = remote_dataset.get_releases(include_unavailable=False)
assert len(releases) == 1
assert isinstance(releases[0], Release)


def test_force_slots_true(mock_is_file_extension_allowed):
mock_is_file_extension_allowed.return_value = True
with tempfile.TemporaryDirectory() as tmp_dir:
with zipfile.ZipFile("tests/data.zip") as zfile:
zfile.extractall(tmp_dir)
annotations_path = Path(tmp_dir) / "v7-darwin-json-v2/force_slots"
images_path = Path("images")
generator, count = download_all_images_from_annotations(
api_key="api_key",
annotations_path=annotations_path,
images_path=images_path,
force_slots=True,
force_replace=True,
)

download_funcs = list(generator())
planned_paths = [download_funcs[i].args[2] for i in range(count)]
expected_paths = [
Path("images/dir1/single_slotted_video_flat/0/mini_uct.mp4"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_3.dcm"),
Path("images/dir1/single_slotted_image_flat/0/001.png"),
Path("images/dir1/multi_slotted_item_flat/0/000000580654.jpg"),
Path("images/dir1/multi_slotted_item_flat/1/000000580676.jpg"),
Path("images/dir1/multi_slotted_item_flat/2/000000580703.jpg"),
]
assert count == 10
for expected_path in expected_paths:
assert expected_path in planned_paths


def test_force_slots_false(mock_is_file_extension_allowed):
mock_is_file_extension_allowed.return_value = True
with tempfile.TemporaryDirectory() as tmp_dir:
with zipfile.ZipFile("tests/data.zip") as zfile:
zfile.extractall(tmp_dir)
annotations_path = Path(tmp_dir) / "v7-darwin-json-v2/force_slots"
images_path = Path("images")
generator, count = download_all_images_from_annotations(
api_key="api_key",
annotations_path=annotations_path,
images_path=images_path,
force_slots=False,
force_replace=True,
)

download_funcs = list(generator())
planned_paths = [download_funcs[i].args[2] for i in range(count)]
expected_paths = [
Path("images/dir1/single_slotted_video_flat.mp4"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_3.dcm"),
Path("images/dir1/single_slotted_image_flat.png"),
Path("images/dir1/multi_slotted_item_flat/0/000000580654.jpg"),
Path("images/dir1/multi_slotted_item_flat/1/000000580676.jpg"),
Path("images/dir1/multi_slotted_item_flat/2/000000580703.jpg"),
]
assert count == 10
for expected_path in expected_paths:
assert expected_path in planned_paths
Binary file modified tests/data.zip
Binary file not shown.

0 comments on commit ad35478

Please sign in to comment.