-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: s3 data validation: annotations, frames, gains #207
Changes from 26 commits
b5c499c
ad042c5
c6ebe05
ce746ca
d610b51
42cafc9
8843332
c8d7ec2
fcd6598
9acdfb7
1c2ab50
3bdfbb1
d1ee0fe
aed8da0
753b310
7472974
065458b
9c147b4
f6de8b9
648d44d
5ed945c
1456a5a
5284505
fee8f88
7aa6a2d
c044270
75708bb
58e843d
f3c3b12
cf7ac52
c149712
2a307da
3cbddf3
df2fd8e
bb01a3c
4e26e40
ecbbd60
67aa2f3
6b06e30
a7f8f30
2aa9fd1
0ed792d
251214b
f1e934f
5a0566c
f1b032f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
manasaV3 marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,10 @@ def glob(self, *args: list[str]) -> list[str]: | |
def open(self, path: str, mode: str) -> TextIOBase: | ||
pass | ||
|
||
@abstractmethod | ||
def size(self, path: str) -> int: | ||
pass | ||
|
||
@abstractmethod | ||
def localreadable(self, path: str) -> str: | ||
pass | ||
|
@@ -86,6 +90,9 @@ def glob(self, *args: list[str]) -> list[str]: | |
def open(self, path: str, mode: str) -> TextIOBase: | ||
return self.s3fs.open(path, mode) | ||
|
||
def size(self, path: str) -> int: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this work for directories as well? If not we should call it |
||
return self.s3fs.size(path) | ||
|
||
def localreadable(self, path: str) -> str: | ||
local_dest_file = os.path.join(self.tmpdir, path) | ||
# Don't re-download it if it's already available. | ||
|
@@ -178,6 +185,9 @@ def glob(self, *args: list[str]) -> list[str]: | |
def open(self, path: str, mode: str) -> TextIOBase: | ||
return open(path, mode) # noqa | ||
|
||
def size(self, path: str) -> int: | ||
return os.path.getsize(path) | ||
|
||
def localreadable(self, path: str) -> str: | ||
return path | ||
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For better assertion errors, https://stackoverflow.com/questions/41522767/pytest-assert-introspection-in-helper-function |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import pytest | ||
|
||
pytest.register_assert_rewrite("tests.helper_mrc", "tests.helper_images", "tests.annotation.helper_point") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lets add a comment here to explain why this is needed. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,18 @@ | |
""" | ||
|
||
import json | ||
from typing import Dict, List | ||
from concurrent.futures import ThreadPoolExecutor | ||
from typing import Dict, List, Union | ||
|
||
import mdocfile | ||
import ndjson | ||
import pandas as pd | ||
import pytest | ||
import tifffile | ||
import tifffile.tifffile | ||
import zarr | ||
from mrcfile.mrcinterpreter import MrcInterpreter | ||
from ome_zarr.io import ZarrLocation | ||
|
||
from common.fs import FileSystemApi | ||
|
||
|
@@ -18,24 +23,98 @@ | |
# ================================================================================================== | ||
|
||
|
||
def get_header(mrcfile: str) -> MrcInterpreter: | ||
"""Get the mrc file header for a tilt series without downloading the entire file.""" | ||
fs = FileSystemApi.get_fs_api(mode="s3", force_overwrite=False) | ||
with fs.open(mrcfile, "rb") as f: | ||
header = MrcInterpreter(iostream=f, permissive=True, header_only=True) | ||
return header | ||
def get_header(mrcfile: str, fs: FileSystemApi) -> MrcInterpreter: | ||
"""Get the mrc file headers for a list of mrc files.""" | ||
try: | ||
with fs.open(mrcfile, "rb") as f: | ||
header = MrcInterpreter(iostream=f, permissive=True, header_only=True) | ||
return header | ||
except Exception as e: | ||
pytest.fail(f"Failed to get header for {mrcfile}: {e}") | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ^ improve the error messages on the get_header function |
||
|
||
def get_zarr_headers(zarrfile: str, fs: FileSystemApi) -> Dict[str, Dict]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should probably call this, getting zarr metadata as opposed to header. |
||
"""Get the zattrs and zarray data for a zarr volume file.""" | ||
expected_subfolders = {f"{zarrfile}/{i}" for i in range(3)}.union({f"{zarrfile}/.zattrs", f"{zarrfile}/.zgroup"}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can change the variable name to be children maybe? |
||
actual_subfolders = {"s3://" + folder for folder in fs.glob(zarrfile + "/*")} | ||
if expected_subfolders != actual_subfolders: | ||
pytest.fail(f"Expected zarr subfolders: {expected_subfolders}, Actual zarr subfolders: {actual_subfolders}") | ||
|
||
fsstore = zarr.storage.FSStore(url=zarrfile, mode="r", fs=fs.s3fs, dimension_separator="/") | ||
manasaV3 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fsstore_subfolders = set(fsstore.listdir()) | ||
expected_fsstore_subfolders = {str(i) for i in range(3)}.union({".zattrs", ".zgroup"}) | ||
if expected_fsstore_subfolders != fsstore_subfolders: | ||
pytest.fail(f"Expected zarr subfolders: {expected_subfolders}, Actual zarr subfolders: {fsstore_subfolders}") | ||
|
||
loc = ZarrLocation(fsstore) | ||
zarrays = {} | ||
for i in range(3): | ||
zarrays[i] = json.loads(fsstore[str(i) + "/.zarray"].decode()) | ||
return {"zattrs": loc.root_attrs, "zarrays": zarrays} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ^ gets zarr header data, while ensuring all the data is properly retrieved |
||
|
||
def get_headers(mrcfiles: List[str]) -> Dict[str, MrcInterpreter]: | ||
"""Get the mrc file headers for a list of mrc files.""" | ||
headers = {} | ||
for file in mrcfiles: | ||
try: | ||
headers[file] = get_header(file) | ||
except Exception as _: | ||
pytest.fail(f"Failed to get header for {file}") | ||
# ================================================================================================== | ||
# Dataset fixtures | ||
# ================================================================================================== | ||
|
||
return headers | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to retrieve multiple headers, since we never have multiple mrc files for a given entity |
||
|
||
@pytest.fixture(scope="session") | ||
def dataset_metadata(dataset_metadata_file: str, filesystem: FileSystemApi) -> Dict: | ||
"""Load the dataset metadata.""" | ||
with filesystem.open(dataset_metadata_file, "r") as f: | ||
return json.load(f) | ||
|
||
|
||
# ================================================================================================== | ||
# Frame fixtures | ||
# ================================================================================================== | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def frames_filesizes(frames_files: List[str], filesystem: FileSystemApi) -> Dict[str, int]: | ||
"""Get the file sizes for a list of frame files.""" | ||
return {frame_file: filesystem.size(frame_file) for frame_file in frames_files} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might not get a lot of value from this test, as a non-zero file size doesn't guarantee the validity of the data. |
||
|
||
|
||
@pytest.fixture(scope="session") | ||
def frames_headers( | ||
frames_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, Union[tifffile.TiffPages, MrcInterpreter]]: | ||
"""Get the headers for a list of frame files.""" | ||
|
||
def open_frame(frame_file: str): | ||
if frame_file.endswith(".mrc"): | ||
return (frame_file, get_header(frame_file, filesystem)) | ||
elif frame_file.endswith(".tif") or frame_file.endswith(".tiff") or frame_file.endswith(".eer"): | ||
with filesystem.open(frame_file, "rb") as f: | ||
# For some reason, just returning the tifffile.TiffFile object gives issues | ||
return (frame_file, tifffile.TiffFile(f).pages) | ||
else: | ||
return None | ||
|
||
# Open the images in parallel | ||
with ThreadPoolExecutor() as executor: | ||
headers = list(executor.map(open_frame, frames_files)) | ||
headers = [header for header in headers if header is not None] | ||
if not headers: | ||
pytest.skip("No file-format supported frames headers found") | ||
|
||
return dict(headers) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. get all the frames' header data, multithreaded to reduce network request wait time |
||
|
||
# ================================================================================================== | ||
# Gain fixtures | ||
# ================================================================================================== | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def gain_mrc_header(gain_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]: | ||
"""Get the mrc file headers for a gain file.""" | ||
if not gain_file.endswith(".mrc"): | ||
pytest.skip(f"Not an mrc file, skipping mrc checks: {gain_file}") | ||
|
||
return {gain_file: get_header(gain_file, filesystem)} | ||
|
||
|
||
# ================================================================================================== | ||
|
@@ -44,9 +123,9 @@ def get_headers(mrcfiles: List[str]) -> Dict[str, MrcInterpreter]: | |
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. from here down to the next comment, it's just refactoring the code knowing that there will only be one mrc file for these entities |
||
@pytest.fixture(scope="session") | ||
def tiltseries_mrc_headers(tiltseries_mrc_files: List[str]) -> Dict[str, MrcInterpreter]: | ||
def tiltseries_mrc_header(tiltseries_mrc_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]: | ||
"""Get the mrc file headers for a tilt series.""" | ||
return get_headers(tiltseries_mrc_files) | ||
return get_header(tiltseries_mrc_file, filesystem) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
|
@@ -57,27 +136,16 @@ def tiltseries_metadata(tiltseries_meta_file: str, filesystem: FileSystemApi) -> | |
|
||
|
||
@pytest.fixture(scope="session") | ||
def tiltseries_mdoc(tiltseries_mdoc_files: List[str], filesystem: FileSystemApi) -> pd.DataFrame: | ||
def tiltseries_mdoc(tiltseries_mdoc_file: str, filesystem: FileSystemApi) -> pd.DataFrame: | ||
"""Load the tiltseries mdoc.""" | ||
mdocs = {} | ||
|
||
for file in tiltseries_mdoc_files: | ||
with filesystem.open(file, "r") as f: | ||
mdocs[file] = mdocfile.read(f) | ||
|
||
return mdocs | ||
return mdocfile.read(filesystem.localreadable(tiltseries_mdoc_file)) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def tiltseries_tlt(tiltseries_tlt_files: List[str], filesystem: FileSystemApi) -> pd.DataFrame: | ||
def tiltseries_tlt(tiltseries_tlt_file: str, filesystem: FileSystemApi) -> pd.DataFrame: | ||
manasaV3 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"""Load the tiltseries tlt.""" | ||
tlt_files = {} | ||
|
||
for file in tiltseries_tlt_files: | ||
with filesystem.open(file, "r") as f: | ||
tlt_files[file] = pd.read_csv(f, sep=r"\s+", header=None, names=["tilt_angle"]) | ||
|
||
return tlt_files | ||
with filesystem.open(tiltseries_tlt_file, "r") as f: | ||
return pd.read_csv(f, sep=r"\s+", header=None, names=["tilt_angle"]) | ||
|
||
|
||
# ================================================================================================== | ||
|
@@ -86,9 +154,9 @@ def tiltseries_tlt(tiltseries_tlt_files: List[str], filesystem: FileSystemApi) - | |
|
||
|
||
@pytest.fixture(scope="session") | ||
def canonical_tomo_mrc_headers(canonical_tomo_mrc_files: List[str]) -> Dict[str, MrcInterpreter]: | ||
def canonical_tomo_mrc_headers(canonical_tomo_mrc_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]: | ||
"""Get the mrc file headers for a tomogram.""" | ||
return get_headers(canonical_tomo_mrc_files) | ||
return get_header(canonical_tomo_mrc_file, filesystem) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
|
@@ -104,63 +172,86 @@ def canonical_tomogram_metadata(canonical_tomo_meta_file: str, filesystem: FileS | |
|
||
|
||
@pytest.fixture(scope="session") | ||
def annotation_metadata(annotation_meta_files: List[str], filesystem: FileSystemApi) -> Dict[str, Dict]: | ||
"""Load the annotation metadata.""" | ||
def annotation_metadata(annotation_metadata_files: List[str], filesystem: FileSystemApi) -> Dict[str, Dict]: | ||
"""Load the annotation metadata. Dictionary structure: metadata = {metadata_a_filename: Dict}, metadata_b_filename: ...}.""" | ||
metadata_objs = {} | ||
|
||
for file in annotation_meta_files: | ||
for file in annotation_metadata_files: | ||
with filesystem.open(file, "r") as f: | ||
metadata_objs[file] = json.load(f) | ||
|
||
return metadata_objs | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. helper function below to help with ndjson data retrieval (for instance seg, point, and oriented point annotations) |
||
def get_annotations(annotation_files: Dict[str, List[str]], filesystem: FileSystemApi) -> Dict[str, Dict[str, Dict]]: | ||
"""Load annotations.""" | ||
def get_ndjson_annotations( | ||
annotation_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, List[Dict]]: | ||
""" | ||
A helper function to load ndjson annotations (a list of annotations, with each annotation being a Dict) | ||
for point, oriented point, and instance segmentation annotations. | ||
Dictionary structure: annotations = {annotation_a_filename: List[Dict], annotation_b_filename: List[Dict]}. | ||
""" | ||
annotations = {} | ||
for annoname, files in annotation_files.items(): | ||
annotations[annoname] = {} | ||
for file in files: | ||
with filesystem.open(file, "r") as f: | ||
annotations[annoname][file] = ndjson.load(f) | ||
# Only need the annotation file to find the annotations. | ||
for annotation_file in annotation_files: | ||
with filesystem.open(annotation_file, "r") as f: | ||
annotations[annotation_file] = ndjson.load(f) | ||
|
||
return annotations | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def point_annotations( | ||
point_annotation_files: Dict[str, List[str]], | ||
point_annotation_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, Dict[str, Dict]]: | ||
) -> Dict[str, List[Dict]]: | ||
"""Load point annotations.""" | ||
return get_annotations(point_annotation_files, filesystem) | ||
return get_ndjson_annotations(point_annotation_files, filesystem) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def oriented_point_annotations( | ||
oriented_point_annotation_files: Dict[str, List[str]], | ||
oriented_point_annotation_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, Dict[str, Dict]]: | ||
) -> Dict[str, List[Dict]]: | ||
"""Load oriented point annotations.""" | ||
return get_annotations(oriented_point_annotation_files, filesystem) | ||
return get_ndjson_annotations(oriented_point_annotation_files, filesystem) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def instance_seg_annotations( | ||
instance_seg_annotation_files: Dict[str, List[str]], | ||
instance_seg_annotation_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, Dict[str, Dict]]: | ||
) -> Dict[str, List[Dict]]: | ||
"""Load instance segmentation annotations.""" | ||
return get_annotations(instance_seg_annotation_files, filesystem) | ||
return get_ndjson_annotations(instance_seg_annotation_files, filesystem) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def seg_mask_annotation_mrc_headers( | ||
seg_mask_annotation_mrc_files: Dict[str, List[str]], | ||
) -> Dict[str, Dict[str, MrcInterpreter]]: | ||
seg_mask_annotation_mrc_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, MrcInterpreter]: | ||
"""Get the mrc file headers for an mrc annotation file.""" | ||
annotations = {} | ||
for annoname, files in seg_mask_annotation_mrc_files.items(): | ||
annotations[annoname] = get_headers(files) | ||
headers = {} | ||
for mrc_filename in seg_mask_annotation_mrc_files: | ||
headers[mrc_filename] = get_header(mrc_filename, filesystem) | ||
|
||
return annotations | ||
return headers | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def seg_mask_annotation_zarr_headers( | ||
seg_mask_annotation_zarr_files: List[str], | ||
filesystem: FileSystemApi, | ||
) -> Dict[str, Dict[str, Dict]]: | ||
""" | ||
Get the zattrs and zarray data for a zarr annotation file. | ||
Dictionary structure: headers = {annotation_a_filename: {"zattrs": Dict, "zarrays": Dict}}, annotation_b_filename: ...}. | ||
""" | ||
headers = {} | ||
for zarr_filename in seg_mask_annotation_zarr_files: | ||
headers[zarr_filename] = get_zarr_headers(zarr_filename, filesystem) | ||
|
||
return headers |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For reading frames files (*.tiff / *.eer)