Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAR-3041][External] Change the behaviour of the force_slots argument of pull() #915

Merged
merged 6 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/EVENT_merge_to_master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:

permissions:
contents: read
id-token: write # Necessary for the generate documentation job

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand All @@ -24,6 +25,9 @@ jobs:
name: Documentation
uses: ./.github/workflows/JOB_generate_documentation.yml
secrets: inherit
permissions:
id-token: write
contents: read

warn_on_fail:
needs: [run_tests, documentation]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/EVENT_tag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

permissions:
contents: write
contents: read

jobs:
create_release:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/JOB_typecheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ jobs:
- name: MyPy typecheck
shell: bash
run: |
pip install pydantic==2.8.2
pip install pydantic
bash ${{ github.workspace }}/deploy/format_lint.sh typecheck ${{ inputs.files }}
20 changes: 11 additions & 9 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ def download_all_images_from_annotations(
Recreate folders
video_frames : bool, default: False
Pulls video frames images instead of video files
force_slots: bool
force_slots: bool, default: False
Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name})
If False, all multi-slotted items and items with slots containing multiple source files will be downloaded as the deeper file structure

Returns
-------
Expand All @@ -90,7 +91,7 @@ def download_all_images_from_annotations(
if is_file_extension_allowed(image.name)
}

annotations_to_download_path = []
annotations_to_download_path: List = []
for annotation_path in annotations_path.glob(f"*.{annotation_format}"):
annotation = parse_darwin_json(annotation_path, count=0)
if annotation is None:
Expand All @@ -108,13 +109,14 @@ def download_all_images_from_annotations(
):
continue

annotations_to_download_path.append(annotation_path)
if len(annotation.slots) > 1:
force_slots = True
if force_slots:
force_slots_for_item = True
else:
force_slots_for_item = len(annotation.slots) > 1 or any(
len(slot.source_files) > 1 for slot in annotation.slots
)

for slot in annotation.slots:
if len(slot.source_files) > 1:
force_slots = True
annotations_to_download_path.append((annotation_path, force_slots_for_item))

if remove_extra:
annotations = (
Expand All @@ -138,7 +140,7 @@ def download_all_images_from_annotations(

# Create the generator with the partial functions
download_functions: List = []
for annotation_path in annotations_to_download_path:
for annotation_path, force_slots in annotations_to_download_path:
file_download_functions = lazy_download_image_from_annotation(
api_key,
annotation_path,
Expand Down
79 changes: 78 additions & 1 deletion tests/darwin/dataset/remote_dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
from darwin.client import Client
from darwin.config import Config
from darwin.dataset import RemoteDataset
from darwin.dataset.download_manager import _download_image_from_json_annotation
from darwin.dataset.download_manager import (
_download_image_from_json_annotation,
download_all_images_from_annotations,
)
from darwin.dataset.release import Release, ReleaseStatus
from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2
from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2
Expand All @@ -25,6 +28,12 @@
from tests.fixtures import *


@pytest.fixture
def mock_is_file_extension_allowed():
with patch("darwin.dataset.download_manager.is_file_extension_allowed") as mock:
yield mock


@pytest.fixture
def annotation_name() -> str:
return "test_video.json"
Expand Down Expand Up @@ -1373,3 +1382,71 @@ def test_omits_unavailable_releases_when_retry_is_false(
releases = remote_dataset.get_releases(include_unavailable=False)
assert len(releases) == 1
assert isinstance(releases[0], Release)


def test_force_slots_true(mock_is_file_extension_allowed):
mock_is_file_extension_allowed.return_value = True
with tempfile.TemporaryDirectory() as tmp_dir:
with zipfile.ZipFile("tests/data.zip") as zfile:
zfile.extractall(tmp_dir)
annotations_path = Path(tmp_dir) / "v7-darwin-json-v2/force_slots"
images_path = Path("images")
generator, count = download_all_images_from_annotations(
api_key="api_key",
annotations_path=annotations_path,
images_path=images_path,
force_slots=True,
force_replace=True,
)

download_funcs = list(generator())
planned_paths = [download_funcs[i].args[2] for i in range(count)]
expected_paths = [
Path("images/dir1/single_slotted_video_flat/0/mini_uct.mp4"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_3.dcm"),
Path("images/dir1/single_slotted_image_flat/0/001.png"),
Path("images/dir1/multi_slotted_item_flat/0/000000580654.jpg"),
Path("images/dir1/multi_slotted_item_flat/1/000000580676.jpg"),
Path("images/dir1/multi_slotted_item_flat/2/000000580703.jpg"),
]
assert count == 10
for expected_path in expected_paths:
assert expected_path in planned_paths


def test_force_slots_false(mock_is_file_extension_allowed):
mock_is_file_extension_allowed.return_value = True
with tempfile.TemporaryDirectory() as tmp_dir:
with zipfile.ZipFile("tests/data.zip") as zfile:
zfile.extractall(tmp_dir)
annotations_path = Path(tmp_dir) / "v7-darwin-json-v2/force_slots"
images_path = Path("images")
generator, count = download_all_images_from_annotations(
api_key="api_key",
annotations_path=annotations_path,
images_path=images_path,
force_slots=False,
force_replace=True,
)

download_funcs = list(generator())
planned_paths = [download_funcs[i].args[2] for i in range(count)]
expected_paths = [
Path("images/dir1/single_slotted_video_flat.mp4"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_1.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_2.dcm"),
Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_3.dcm"),
Path("images/dir1/single_slotted_image_flat.png"),
Path("images/dir1/multi_slotted_item_flat/0/000000580654.jpg"),
Path("images/dir1/multi_slotted_item_flat/1/000000580676.jpg"),
Path("images/dir1/multi_slotted_item_flat/2/000000580703.jpg"),
]
assert count == 10
for expected_path in expected_paths:
assert expected_path in planned_paths
Binary file modified tests/data.zip
Binary file not shown.