Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: s3 data validation: annotations, frames, gains #207

Merged
merged 46 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
b5c499c
Port over cryoet-portal s3 data validation scripts created by @uermel
daniel-ji Aug 1, 2024
ad042c5
remove some of utz validation scripts from pr
daniel-ji Aug 6, 2024
c6ebe05
make annotation tests separate
daniel-ji Aug 6, 2024
ce746ca
fix pytest
daniel-ji Aug 6, 2024
d610b51
fix pytest again
daniel-ji Aug 7, 2024
42cafc9
fix pytest for real
daniel-ji Aug 7, 2024
8843332
delete yarn.lock
daniel-ji Aug 7, 2024
c8d7ec2
refactor s3 data validation data.py
daniel-ji Aug 7, 2024
fcd6598
work on s3 data validation for annotations
daniel-ji Aug 7, 2024
9acdfb7
work on s3 data validation, clean things up
daniel-ji Aug 8, 2024
1c2ab50
Merge branch 'chanzuckerberg:main' into s3-data-validation-annotations
daniel-ji Aug 8, 2024
3bdfbb1
refactor annotation testing, simplify fixtures & tests
daniel-ji Aug 9, 2024
d1ee0fe
add more annotation validation tests
daniel-ji Aug 9, 2024
aed8da0
fix type annotations for data validation tests
daniel-ji Aug 9, 2024
753b310
small fixes for annotation validation tests
daniel-ji Aug 9, 2024
7472974
clean up data validation
daniel-ji Aug 9, 2024
065458b
add deposition / dataset / photos tests
daniel-ji Aug 10, 2024
9c147b4
use omezarr to read zarr headers for s3 validation
daniel-ji Aug 13, 2024
f6de8b9
Merge branch 'daniel-ji/s3-data-validation-annotations' into daniel-j…
daniel-ji Aug 13, 2024
648d44d
work on frames validation
daniel-ji Aug 14, 2024
5ed945c
improve s3 frame data validation reporting
daniel-ji Aug 14, 2024
1456a5a
refactor annotation validation
daniel-ji Aug 14, 2024
5284505
Merge branch 'daniel-ji/s3-data-validation-annotations' into daniel-j…
daniel-ji Aug 14, 2024
fee8f88
Merge branch 'daniel-ji/s3-data-validation-dataset-deposition-photos'…
daniel-ji Aug 14, 2024
7aa6a2d
work on gain and frames validation
daniel-ji Aug 16, 2024
c044270
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Aug 16, 2024
75708bb
small validation improvements
daniel-ji Aug 16, 2024
58e843d
cleanup code a bit
daniel-ji Aug 16, 2024
f3c3b12
cleanup s3 data validation
daniel-ji Aug 17, 2024
cf7ac52
don't let ome zarr major version upgrades occur
daniel-ji Aug 19, 2024
c149712
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Aug 21, 2024
2a307da
more frames & gain fixes based on utz and anchi feedback
daniel-ji Aug 22, 2024
3cbddf3
comment out mrcbz2 data validation for now, will fix
daniel-ji Aug 22, 2024
df2fd8e
more fixes
daniel-ji Aug 22, 2024
bb01a3c
fixes and more tests
daniel-ji Aug 23, 2024
4e26e40
fix test text
daniel-ji Aug 24, 2024
ecbbd60
remove file_size fs method, not needed
daniel-ji Aug 26, 2024
67aa2f3
frames / gains fixes
daniel-ji Aug 26, 2024
6b06e30
support multiple gains, also check format
daniel-ji Aug 26, 2024
a7f8f30
improve frame / gains tests
daniel-ji Aug 26, 2024
2aa9fd1
more small fixes
daniel-ji Aug 27, 2024
0ed792d
fxi mdoc file loading
daniel-ji Aug 27, 2024
251214b
fix gains
daniel-ji Aug 28, 2024
f1e934f
frames format warning instead of error
daniel-ji Aug 29, 2024
5a0566c
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Aug 30, 2024
f1b032f
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Sep 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions ingestion_tools/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions ingestion_tools/pyproject.toml
Copy link
Contributor Author

@daniel-ji daniel-ji Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reading frames files (*.tiff / *.eer)

Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ cryoet-data-portal-neuroglancer = { git = "https://github.com/chanzuckerberg/cry
distinctipy = "^1.3.4"
imodmodel = "0.0.12"
mdocfile = "^0.1.2"
tifffile = "^2024.8.10"

[tool.black]
line-length = 120
Expand Down
10 changes: 10 additions & 0 deletions ingestion_tools/scripts/common/fs.py
manasaV3 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def glob(self, *args: list[str]) -> list[str]:
def open(self, path: str, mode: str) -> TextIOBase:
pass

@abstractmethod
def size(self, path: str) -> int:
    """Return the size in bytes of the object at ``path``.

    Implemented by each concrete filesystem (S3, local).
    """
    pass

@abstractmethod
def localreadable(self, path: str) -> str:
pass
Expand Down Expand Up @@ -86,6 +90,9 @@ def glob(self, *args: list[str]) -> list[str]:
def open(self, path: str, mode: str) -> TextIOBase:
return self.s3fs.open(path, mode)

def size(self, path: str) -> int:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work for directories as well? If not we should call it file_size.

return self.s3fs.size(path)

def localreadable(self, path: str) -> str:
local_dest_file = os.path.join(self.tmpdir, path)
# Don't re-download it if it's already available.
Expand Down Expand Up @@ -178,6 +185,9 @@ def glob(self, *args: list[str]) -> list[str]:
def open(self, path: str, mode: str) -> TextIOBase:
return open(path, mode) # noqa

def size(self, path: str) -> int:
    """Return the size in bytes of the local file at ``path``."""
    # os.path.getsize is a thin wrapper around os.stat; use stat directly.
    return os.stat(path).st_size

def localreadable(self, path: str) -> str:
return path

Expand Down
3 changes: 3 additions & 0 deletions ingestion_tools/scripts/data_validation/__init__.py
Copy link
Contributor Author

@daniel-ji daniel-ji Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import pytest

# Enable pytest's assertion rewriting for these helper modules so that the
# assert statements they contain report detailed introspection output
# (expected vs. actual values) instead of a bare AssertionError.
# register_assert_rewrite must be called before the helper modules are
# imported anywhere, which is why it lives in this package __init__.
pytest.register_assert_rewrite("tests.helper_mrc", "tests.helper_images", "tests.annotation.helper_point")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add a comment here to explain why this is needed.

3 changes: 3 additions & 0 deletions ingestion_tools/scripts/data_validation/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ def pytest_configure(config: pytest.Config) -> None:
# Register markers
config.addinivalue_line("markers", "annotation: Tests concerning the annotation data.")
config.addinivalue_line("markers", "dataset: Tests concerning the dataset.")
config.addinivalue_line("markers", "deposition: Tests concerning the deposition data.")
config.addinivalue_line("markers", "frame: Tests concerning the frames.")
config.addinivalue_line("markers", "gain: Tests concerning the gain files.")
config.addinivalue_line("markers", "run: Tests concerning the runs.")
config.addinivalue_line("markers", "tiltseries: Tests concerning the tiltseries.")
config.addinivalue_line("markers", "tomogram: Tests concerning the tomogram.")
Expand Down
213 changes: 152 additions & 61 deletions ingestion_tools/scripts/data_validation/fixtures/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@
"""

import json
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Union

import mdocfile
import ndjson
import pandas as pd
import pytest
import tifffile
import tifffile.tifffile
import zarr
from mrcfile.mrcinterpreter import MrcInterpreter
from ome_zarr.io import ZarrLocation

from common.fs import FileSystemApi

Expand All @@ -18,24 +23,98 @@
# ==================================================================================================


def get_header(mrcfile: str) -> MrcInterpreter:
"""Get the mrc file header for a tilt series without downloading the entire file."""
fs = FileSystemApi.get_fs_api(mode="s3", force_overwrite=False)
with fs.open(mrcfile, "rb") as f:
header = MrcInterpreter(iostream=f, permissive=True, header_only=True)
return header
def get_header(mrcfile: str, fs: FileSystemApi) -> MrcInterpreter:
    """Get the MRC header for a single MRC file, without reading the data block.

    Fails the current pytest test (rather than raising) if the header cannot
    be read, so validation reports a clear per-file error message.
    """
    try:
        with fs.open(mrcfile, "rb") as f:
            # header_only avoids transferring the (potentially large) data section.
            header = MrcInterpreter(iostream=f, permissive=True, header_only=True)
            return header
    except Exception as e:
        pytest.fail(f"Failed to get header for {mrcfile}: {e}")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ improve the error messages on the get_header function


def get_zarr_headers(zarrfile: str, fs: FileSystemApi) -> Dict[str, Dict]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably call this "get zarr metadata" rather than "header", since zarr volumes don't have headers in the mrc sense.

"""Get the zattrs and zarray data for a zarr volume file."""
expected_subfolders = {f"{zarrfile}/{i}" for i in range(3)}.union({f"{zarrfile}/.zattrs", f"{zarrfile}/.zgroup"})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can change the variable name to be children maybe?

actual_subfolders = {"s3://" + folder for folder in fs.glob(zarrfile + "/*")}
if expected_subfolders != actual_subfolders:
pytest.fail(f"Expected zarr subfolders: {expected_subfolders}, Actual zarr subfolders: {actual_subfolders}")

fsstore = zarr.storage.FSStore(url=zarrfile, mode="r", fs=fs.s3fs, dimension_separator="/")
manasaV3 marked this conversation as resolved.
Show resolved Hide resolved
fsstore_subfolders = set(fsstore.listdir())
expected_fsstore_subfolders = {str(i) for i in range(3)}.union({".zattrs", ".zgroup"})
if expected_fsstore_subfolders != fsstore_subfolders:
pytest.fail(f"Expected zarr subfolders: {expected_subfolders}, Actual zarr subfolders: {fsstore_subfolders}")

loc = ZarrLocation(fsstore)
zarrays = {}
for i in range(3):
zarrays[i] = json.loads(fsstore[str(i) + "/.zarray"].decode())
return {"zattrs": loc.root_attrs, "zarrays": zarrays}

Copy link
Contributor Author

@daniel-ji daniel-ji Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ gets zarr header data, while ensuring all the data is properly retrieved


def get_headers(mrcfiles: List[str]) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for a list of mrc files."""
headers = {}
for file in mrcfiles:
try:
headers[file] = get_header(file)
except Exception as _:
pytest.fail(f"Failed to get header for {file}")
# ==================================================================================================
# Dataset fixtures
# ==================================================================================================

return headers
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to retrieve multiple headers, since we never have multiple mrc files for a given entity


@pytest.fixture(scope="session")
def dataset_metadata(dataset_metadata_file: str, filesystem: FileSystemApi) -> Dict:
    """Load and parse the dataset metadata JSON file."""
    with filesystem.open(dataset_metadata_file, "r") as f:
        metadata = json.load(f)
    return metadata


# ==================================================================================================
# Frame fixtures
# ==================================================================================================


@pytest.fixture(scope="session")
def frames_filesizes(frames_files: List[str], filesystem: FileSystemApi) -> Dict[str, int]:
    """Get the file sizes for a list of frame files."""
    sizes = {}
    for frame_file in frames_files:
        sizes[frame_file] = filesystem.size(frame_file)
    return sizes
Copy link
Contributor

@manasaV3 manasaV3 Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might not get a lot of value from this test, as a non-zero file size doesn't guarantee the validity of the data.



@pytest.fixture(scope="session")
def frames_headers(
    frames_files: List[str],
    filesystem: FileSystemApi,
) -> Dict[str, Union[tifffile.TiffPages, MrcInterpreter]]:
    """Get the headers for a list of frame files.

    Returns a mapping of frame filename -> header object: MrcInterpreter for
    ``.mrc`` files, tifffile.TiffPages for ``.tif`` / ``.tiff`` / ``.eer``
    files. Files in any other format are ignored; if no supported files are
    found the dependent tests are skipped.
    """

    def open_frame(frame_file: str):
        # Only .mrc and TIFF-container formats are supported for header checks.
        if frame_file.endswith(".mrc"):
            return (frame_file, get_header(frame_file, filesystem))
        elif frame_file.endswith((".tif", ".tiff", ".eer")):
            with filesystem.open(frame_file, "rb") as f:
                # For some reason, just returning the tifffile.TiffFile object gives issues
                return (frame_file, tifffile.TiffFile(f).pages)
        else:
            return None

    # Open the files in parallel to reduce network round-trip wait time.
    # Materialize the results while the executor is still alive.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(open_frame, frames_files))

    headers = dict(entry for entry in results if entry is not None)
    if not headers:
        pytest.skip("No file-format supported frames headers found")

    return headers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get all the frames' header data, multithreaded to reduce network request wait time


# ==================================================================================================
# Gain fixtures
# ==================================================================================================


@pytest.fixture(scope="session")
def gain_mrc_header(gain_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]:
    """Get the mrc file header for a gain file, skipping non-mrc gain files."""
    if gain_file.endswith(".mrc"):
        return {gain_file: get_header(gain_file, filesystem)}
    pytest.skip(f"Not an mrc file, skipping mrc checks: {gain_file}")


# ==================================================================================================
Expand All @@ -44,9 +123,9 @@ def get_headers(mrcfiles: List[str]) -> Dict[str, MrcInterpreter]:


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from here down to the next comment, it's just refactoring the code knowing that there will only be one mrc file for these entities

@pytest.fixture(scope="session")
def tiltseries_mrc_headers(tiltseries_mrc_files: List[str]) -> Dict[str, MrcInterpreter]:
def tiltseries_mrc_header(tiltseries_mrc_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for a tilt series."""
return get_headers(tiltseries_mrc_files)
return get_header(tiltseries_mrc_file, filesystem)


@pytest.fixture(scope="session")
Expand All @@ -57,27 +136,16 @@ def tiltseries_metadata(tiltseries_meta_file: str, filesystem: FileSystemApi) ->


@pytest.fixture(scope="session")
def tiltseries_mdoc(tiltseries_mdoc_files: List[str], filesystem: FileSystemApi) -> pd.DataFrame:
def tiltseries_mdoc(tiltseries_mdoc_file: str, filesystem: FileSystemApi) -> pd.DataFrame:
"""Load the tiltseries mdoc."""
mdocs = {}

for file in tiltseries_mdoc_files:
with filesystem.open(file, "r") as f:
mdocs[file] = mdocfile.read(f)

return mdocs
return mdocfile.read(filesystem.localreadable(tiltseries_mdoc_file))


@pytest.fixture(scope="session")
def tiltseries_tlt(tiltseries_tlt_files: List[str], filesystem: FileSystemApi) -> pd.DataFrame:
def tiltseries_tlt(tiltseries_tlt_file: str, filesystem: FileSystemApi) -> pd.DataFrame:
manasaV3 marked this conversation as resolved.
Show resolved Hide resolved
"""Load the tiltseries tlt."""
tlt_files = {}

for file in tiltseries_tlt_files:
with filesystem.open(file, "r") as f:
tlt_files[file] = pd.read_csv(f, sep=r"\s+", header=None, names=["tilt_angle"])

return tlt_files
with filesystem.open(tiltseries_tlt_file, "r") as f:
return pd.read_csv(f, sep=r"\s+", header=None, names=["tilt_angle"])


# ==================================================================================================
Expand All @@ -86,9 +154,9 @@ def tiltseries_tlt(tiltseries_tlt_files: List[str], filesystem: FileSystemApi) -


@pytest.fixture(scope="session")
def canonical_tomo_mrc_headers(canonical_tomo_mrc_files: List[str]) -> Dict[str, MrcInterpreter]:
def canonical_tomo_mrc_headers(canonical_tomo_mrc_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for a tomogram."""
return get_headers(canonical_tomo_mrc_files)
return get_header(canonical_tomo_mrc_file, filesystem)


@pytest.fixture(scope="session")
Expand All @@ -104,63 +172,86 @@ def canonical_tomogram_metadata(canonical_tomo_meta_file: str, filesystem: FileS


@pytest.fixture(scope="session")
def annotation_metadata(annotation_meta_files: List[str], filesystem: FileSystemApi) -> Dict[str, Dict]:
"""Load the annotation metadata."""
def annotation_metadata(annotation_metadata_files: List[str], filesystem: FileSystemApi) -> Dict[str, Dict]:
"""Load the annotation metadata. Dictionary structure: metadata = {metadata_a_filename: Dict}, metadata_b_filename: ...}."""
metadata_objs = {}

for file in annotation_meta_files:
for file in annotation_metadata_files:
with filesystem.open(file, "r") as f:
metadata_objs[file] = json.load(f)

return metadata_objs


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

helper function below to help with ndjson data retrieval (for instance seg, point, and oriented point annotations)

def get_annotations(annotation_files: Dict[str, List[str]], filesystem: FileSystemApi) -> Dict[str, Dict[str, Dict]]:
"""Load annotations."""
def get_ndjson_annotations(
annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, List[Dict]]:
"""
A helper function to load ndjson annotations (a list of annotations, with each annotation being a Dict)
for point, oriented point, and instance segmentation annotations.
Dictionary structure: annotations = {annotation_a_filename: List[Dict], annotation_b_filename: List[Dict]}.
"""
annotations = {}
for annoname, files in annotation_files.items():
annotations[annoname] = {}
for file in files:
with filesystem.open(file, "r") as f:
annotations[annoname][file] = ndjson.load(f)
# Only need the annotation file to find the annotations.
for annotation_file in annotation_files:
with filesystem.open(annotation_file, "r") as f:
annotations[annotation_file] = ndjson.load(f)

return annotations


@pytest.fixture(scope="session")
def point_annotations(
point_annotation_files: Dict[str, List[str]],
point_annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
) -> Dict[str, List[Dict]]:
"""Load point annotations."""
return get_annotations(point_annotation_files, filesystem)
return get_ndjson_annotations(point_annotation_files, filesystem)


@pytest.fixture(scope="session")
def oriented_point_annotations(
oriented_point_annotation_files: Dict[str, List[str]],
oriented_point_annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
) -> Dict[str, List[Dict]]:
"""Load oriented point annotations."""
return get_annotations(oriented_point_annotation_files, filesystem)
return get_ndjson_annotations(oriented_point_annotation_files, filesystem)


@pytest.fixture(scope="session")
def instance_seg_annotations(
instance_seg_annotation_files: Dict[str, List[str]],
instance_seg_annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
) -> Dict[str, List[Dict]]:
"""Load instance segmentation annotations."""
return get_annotations(instance_seg_annotation_files, filesystem)
return get_ndjson_annotations(instance_seg_annotation_files, filesystem)


@pytest.fixture(scope="session")
def seg_mask_annotation_mrc_headers(
seg_mask_annotation_mrc_files: Dict[str, List[str]],
) -> Dict[str, Dict[str, MrcInterpreter]]:
seg_mask_annotation_mrc_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for an mrc annotation file."""
annotations = {}
for annoname, files in seg_mask_annotation_mrc_files.items():
annotations[annoname] = get_headers(files)
headers = {}
for mrc_filename in seg_mask_annotation_mrc_files:
headers[mrc_filename] = get_header(mrc_filename, filesystem)

return annotations
return headers


@pytest.fixture(scope="session")
def seg_mask_annotation_zarr_headers(
    seg_mask_annotation_zarr_files: List[str],
    filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
    """
    Get the zattrs and zarray data for each zarr annotation file.
    Dictionary structure: headers = {annotation_a_filename: {"zattrs": Dict, "zarrays": Dict}}, annotation_b_filename: ...}.
    """
    return {
        zarr_filename: get_zarr_headers(zarr_filename, filesystem)
        for zarr_filename in seg_mask_annotation_zarr_files
    }
Loading
Loading