Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAR-4305][External] Cancel push operations if there are no valid files to upload, and correctly remove invalid files #944

Merged
merged 1 commit into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions darwin/dataset/upload_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,18 +452,20 @@ def skip_existing_full_remote_filepaths(self) -> None:
local_files_to_remove.extend(multi_file_item.files)
multi_file_items_to_remove.append(multi_file_item)
console.print(
f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the {self.dataset.slug} dataset. Skipping upload of item.",
f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the `{self.dataset.slug}` dataset. Skipping upload of item.",
style="warning",
)
if self.local_files:
for local_file in self.local_files:
if Path(local_file.full_path) in full_remote_filepaths:
if (
Path(local_file.full_path) in full_remote_filepaths
and local_file not in local_files_to_remove
):
local_files_to_remove.append(local_file)
console.print(
f"The remote filepath {local_file.full_path} already exists in the {self.dataset.slug} dataset. Skipping upload of item.",
f"The remote filepath {local_file.full_path} already exists in the `{self.dataset.slug}` dataset. Skipping upload of item.",
style="warning",
)

self.local_files = [
local_file
for local_file in self.local_files
Expand All @@ -476,6 +478,11 @@ def skip_existing_full_remote_filepaths(self) -> None:
if multi_file_item not in multi_file_items_to_remove
]

if not self.local_files and not self.multi_file_items:
raise ValueError(
"All items to be uploaded have paths that already exist in the remote dataset. No items to upload."
)

def prepare_upload(
self,
) -> Optional[Iterator[Callable[[Optional[ByteReadCallback]], None]]]:
Expand Down
31 changes: 28 additions & 3 deletions tests/darwin/dataset/upload_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ def test_request_upload_is_not_called_on_init(
dataset: RemoteDataset, request_upload_endpoint: str
):
with patch.object(dataset, "fetch_remote_files", return_value=[]):
upload_handler = UploadHandler.build(dataset, [])
with patch.object(
UploadHandler, "skip_existing_full_remote_filepaths", return_value=[]
):
upload_handler = UploadHandler.build(dataset, [])

assert upload_handler.pending_count == 0
assert upload_handler.blocked_count == 0
Expand Down Expand Up @@ -446,7 +449,7 @@ def test_skip_existing_full_remote_filepaths_with_local_files():
assert local_file_2 in upload_handler.local_files

mock_print.assert_any_call(
"The remote filepath /existing_file_1.jpg already exists in the test-dataset dataset. Skipping upload of item.",
"The remote filepath /existing_file_1.jpg already exists in the `test-dataset` dataset. Skipping upload of item.",
style="warning",
)

Expand Down Expand Up @@ -475,6 +478,28 @@ def test_skip_existing_full_remote_filepaths_with_multi_file_items():

# Verify that the correct warning was printed
mock_print.assert_any_call(
"The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the test-dataset dataset. Skipping upload of item.",
"The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the `test-dataset` dataset. Skipping upload of item.",
style="warning",
)


def test_skip_existing_full_remote_filepaths_raises_if_no_files_left():
mock_dataset = MagicMock()
mock_dataset.fetch_remote_files.return_value = [
MagicMock(full_path="/existing_multi_file_item_1.jpg"),
MagicMock(full_path="/existing_multi_file_item_2.jpg"),
]
mock_dataset.slug = "test-dataset"

multi_file_item_1 = MagicMock(
full_path="/existing_multi_file_item_1.jpg", files=[MagicMock()]
)
multi_file_item_2 = MagicMock(
full_path="/existing_multi_file_item_2.jpg", files=[MagicMock()]
)

with pytest.raises(
ValueError,
match="All items to be uploaded have paths that already exist in the remote dataset. No items to upload.",
):
UploadHandlerV2(mock_dataset, [], [multi_file_item_1, multi_file_item_2])