Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: s3 data validation: annotations, frames, gains #207

Merged
merged 46 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
b5c499c
Port over cryoet-portal s3 data validation scripts created by @uermel
daniel-ji Aug 1, 2024
ad042c5
remove some of utz validation scripts from pr
daniel-ji Aug 6, 2024
c6ebe05
make annotation tests separate
daniel-ji Aug 6, 2024
ce746ca
fix pytest
daniel-ji Aug 6, 2024
d610b51
fix pytest again
daniel-ji Aug 7, 2024
42cafc9
fix pytest for real
daniel-ji Aug 7, 2024
8843332
delete yarn.lock
daniel-ji Aug 7, 2024
c8d7ec2
refactor s3 data validation data.py
daniel-ji Aug 7, 2024
fcd6598
work on s3 data validation for annotations
daniel-ji Aug 7, 2024
9acdfb7
work on s3 data validation, clean things up
daniel-ji Aug 8, 2024
1c2ab50
Merge branch 'chanzuckerberg:main' into s3-data-validation-annotations
daniel-ji Aug 8, 2024
3bdfbb1
refactor annotation testing, simplify fixtures & tests
daniel-ji Aug 9, 2024
d1ee0fe
add more annotation validation tests
daniel-ji Aug 9, 2024
aed8da0
fix type annotations for data validation tests
daniel-ji Aug 9, 2024
753b310
small fixes for annotation validation tests
daniel-ji Aug 9, 2024
7472974
clean up data validation
daniel-ji Aug 9, 2024
065458b
add deposition / dataset / photos tests
daniel-ji Aug 10, 2024
9c147b4
use omezarr to read zarr headers for s3 validation
daniel-ji Aug 13, 2024
f6de8b9
Merge branch 'daniel-ji/s3-data-validation-annotations' into daniel-j…
daniel-ji Aug 13, 2024
648d44d
work on frames validation
daniel-ji Aug 14, 2024
5ed945c
improve s3 frame data validation reporting
daniel-ji Aug 14, 2024
1456a5a
refactor annotation validation
daniel-ji Aug 14, 2024
5284505
Merge branch 'daniel-ji/s3-data-validation-annotations' into daniel-j…
daniel-ji Aug 14, 2024
fee8f88
Merge branch 'daniel-ji/s3-data-validation-dataset-deposition-photos'…
daniel-ji Aug 14, 2024
7aa6a2d
work on gain and frames validation
daniel-ji Aug 16, 2024
c044270
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Aug 16, 2024
75708bb
small validation improvements
daniel-ji Aug 16, 2024
58e843d
cleanup code a bit
daniel-ji Aug 16, 2024
f3c3b12
cleanup s3 data validation
daniel-ji Aug 17, 2024
cf7ac52
don't let ome zarr major version upgrades occur
daniel-ji Aug 19, 2024
c149712
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Aug 21, 2024
2a307da
more frames & gain fixes based on utz and anchi feedback
daniel-ji Aug 22, 2024
3cbddf3
comment out mrcbz2 data validation for now, will fix
daniel-ji Aug 22, 2024
df2fd8e
more fixes
daniel-ji Aug 22, 2024
bb01a3c
fixes and more tests
daniel-ji Aug 23, 2024
4e26e40
fix test text
daniel-ji Aug 24, 2024
ecbbd60
remove file_size fs method, not needed
daniel-ji Aug 26, 2024
67aa2f3
frames / gains fixes
daniel-ji Aug 26, 2024
6b06e30
support multiple gains, also check format
daniel-ji Aug 26, 2024
a7f8f30
improve frame / gains tests
daniel-ji Aug 26, 2024
2aa9fd1
more small fixes
daniel-ji Aug 27, 2024
0ed792d
fxi mdoc file loading
daniel-ji Aug 27, 2024
251214b
fix gains
daniel-ji Aug 28, 2024
f1e934f
frames format warning instead of error
daniel-ji Aug 29, 2024
5a0566c
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Aug 30, 2024
f1b032f
Merge branch 'main' into daniel-ji/s3-data-validation-frames-gains
daniel-ji Sep 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions ingestion_tools/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions ingestion_tools/pyproject.toml
Copy link
Contributor Author

@daniel-ji daniel-ji Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reading frames files (*.tiff / *.eer)

Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ cryoet-data-portal-neuroglancer = { git = "https://github.com/chanzuckerberg/cry
distinctipy = "^1.3.4"
imodmodel = "0.0.12"
mdocfile = "^0.1.2"
tifffile = "^2024.8.10"

[tool.black]
line-length = 120
Expand Down
10 changes: 10 additions & 0 deletions ingestion_tools/scripts/common/fs.py
manasaV3 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def glob(self, *args: list[str]) -> list[str]:
def open(self, path: str, mode: str) -> TextIOBase:
pass

@abstractmethod
def size(self, path: str) -> int:
    """Return the size in bytes of the object at ``path``.

    Implemented by each concrete filesystem (S3, local).
    """
    pass

@abstractmethod
def localreadable(self, path: str) -> str:
pass
Expand Down Expand Up @@ -86,6 +90,9 @@ def glob(self, *args: list[str]) -> list[str]:
def open(self, path: str, mode: str) -> TextIOBase:
return self.s3fs.open(path, mode)

def size(self, path: str) -> int:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work for directories as well? If not we should call it file_size.

return self.s3fs.size(path)

def localreadable(self, path: str) -> str:
local_dest_file = os.path.join(self.tmpdir, path)
# Don't re-download it if it's already available.
Expand Down Expand Up @@ -178,6 +185,9 @@ def glob(self, *args: list[str]) -> list[str]:
def open(self, path: str, mode: str) -> TextIOBase:
return open(path, mode) # noqa

def size(self, path: str) -> int:
    """Return the size in bytes of the local file at ``path``."""
    # os.path.getsize is a thin wrapper around os.stat; use stat directly.
    return os.stat(path).st_size

def localreadable(self, path: str) -> str:
return path

Expand Down
3 changes: 3 additions & 0 deletions ingestion_tools/scripts/data_validation/__init__.py
Copy link
Contributor Author

@daniel-ji daniel-ji Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import pytest

# Enable pytest's assertion rewriting for these helper modules so that the
# assert statements they contain report detailed introspection output
# (expected vs. actual values) instead of a bare AssertionError.
# register_assert_rewrite must be called before the helper modules are
# imported anywhere, which is why it lives in this package __init__.
pytest.register_assert_rewrite("tests.helper_mrc", "tests.helper_images", "tests.annotation.helper_point")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add a comment here to explain why this is needed.

3 changes: 3 additions & 0 deletions ingestion_tools/scripts/data_validation/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ def pytest_configure(config: pytest.Config) -> None:
# Register markers
config.addinivalue_line("markers", "annotation: Tests concerning the annotation data.")
config.addinivalue_line("markers", "dataset: Tests concerning the dataset.")
config.addinivalue_line("markers", "deposition: Tests concerning the deposition data.")
config.addinivalue_line("markers", "frame: Tests concerning the frames.")
config.addinivalue_line("markers", "gain: Tests concerning the gain files.")
config.addinivalue_line("markers", "run: Tests concerning the runs.")
config.addinivalue_line("markers", "tiltseries: Tests concerning the tiltseries.")
config.addinivalue_line("markers", "tomogram: Tests concerning the tomogram.")
Expand Down
213 changes: 152 additions & 61 deletions ingestion_tools/scripts/data_validation/fixtures/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@
"""

import json
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Union

import mdocfile
import ndjson
import pandas as pd
import pytest
import tifffile
import tifffile.tifffile
import zarr
from mrcfile.mrcinterpreter import MrcInterpreter
from ome_zarr.io import ZarrLocation

from common.fs import FileSystemApi

Expand All @@ -18,24 +23,98 @@
# ==================================================================================================


def get_header(mrcfile: str) -> MrcInterpreter:
"""Get the mrc file header for a tilt series without downloading the entire file."""
fs = FileSystemApi.get_fs_api(mode="s3", force_overwrite=False)
with fs.open(mrcfile, "rb") as f:
header = MrcInterpreter(iostream=f, permissive=True, header_only=True)
return header
def get_header(mrcfile: str, fs: FileSystemApi) -> MrcInterpreter:
    """Get the MRC header for a single MRC file, without reading the data block.

    Fails the current pytest test (rather than raising) if the header cannot
    be read, so validation reports a clear per-file error message.
    """
    try:
        with fs.open(mrcfile, "rb") as f:
            # header_only avoids transferring the (potentially large) data section.
            header = MrcInterpreter(iostream=f, permissive=True, header_only=True)
            return header
    except Exception as e:
        pytest.fail(f"Failed to get header for {mrcfile}: {e}")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ improve the error messages on the get_header function


def get_zarr_headers(zarrfile: str, fs: FileSystemApi) -> Dict[str, Dict]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably call this "get zarr metadata" rather than "header", since zarr volumes don't have headers in the mrc sense.

"""Get the zattrs and zarray data for a zarr volume file."""
expected_subfolders = {f"{zarrfile}/{i}" for i in range(3)}.union({f"{zarrfile}/.zattrs", f"{zarrfile}/.zgroup"})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can change the variable name to be children maybe?

actual_subfolders = {"s3://" + folder for folder in fs.glob(zarrfile + "/*")}
if expected_subfolders != actual_subfolders:
pytest.fail(f"Expected zarr subfolders: {expected_subfolders}, Actual zarr subfolders: {actual_subfolders}")

fsstore = zarr.storage.FSStore(url=zarrfile, mode="r", fs=fs.s3fs, dimension_separator="/")
manasaV3 marked this conversation as resolved.
Show resolved Hide resolved
fsstore_subfolders = set(fsstore.listdir())
expected_fsstore_subfolders = {str(i) for i in range(3)}.union({".zattrs", ".zgroup"})
if expected_fsstore_subfolders != fsstore_subfolders:
pytest.fail(f"Expected zarr subfolders: {expected_subfolders}, Actual zarr subfolders: {fsstore_subfolders}")

loc = ZarrLocation(fsstore)
zarrays = {}
for i in range(3):
zarrays[i] = json.loads(fsstore[str(i) + "/.zarray"].decode())
return {"zattrs": loc.root_attrs, "zarrays": zarrays}

Copy link
Contributor Author

@daniel-ji daniel-ji Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ gets zarr header data, while ensuring all the data is properly retrieved


def get_headers(mrcfiles: List[str]) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for a list of mrc files."""
headers = {}
for file in mrcfiles:
try:
headers[file] = get_header(file)
except Exception as _:
pytest.fail(f"Failed to get header for {file}")
# ==================================================================================================
# Dataset fixtures
# ==================================================================================================

return headers
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to retrieve multiple headers, since we never have multiple mrc files for a given entity


@pytest.fixture(scope="session")
def dataset_metadata(dataset_metadata_file: str, filesystem: FileSystemApi) -> Dict:
    """Load and parse the dataset metadata JSON file."""
    with filesystem.open(dataset_metadata_file, "r") as f:
        metadata = json.load(f)
    return metadata


# ==================================================================================================
# Frame fixtures
# ==================================================================================================


@pytest.fixture(scope="session")
def frames_filesizes(frames_files: List[str], filesystem: FileSystemApi) -> Dict[str, int]:
    """Get the file sizes for a list of frame files."""
    sizes = {}
    for frame_file in frames_files:
        sizes[frame_file] = filesystem.size(frame_file)
    return sizes
Copy link
Contributor

@manasaV3 manasaV3 Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might not get a lot of value from this test, as a non-zero file size doesn't guarantee the validity of the data.



@pytest.fixture(scope="session")
def frames_headers(
    frames_files: List[str],
    filesystem: FileSystemApi,
) -> Dict[str, Union[tifffile.TiffPages, MrcInterpreter]]:
    """Get the headers for a list of frame files.

    Returns a mapping of frame filename -> header object: MrcInterpreter for
    ``.mrc`` files, tifffile.TiffPages for ``.tif`` / ``.tiff`` / ``.eer``
    files. Files in any other format are ignored; if no supported files are
    found the dependent tests are skipped.
    """

    def open_frame(frame_file: str):
        # Only .mrc and TIFF-container formats are supported for header checks.
        if frame_file.endswith(".mrc"):
            return (frame_file, get_header(frame_file, filesystem))
        elif frame_file.endswith((".tif", ".tiff", ".eer")):
            with filesystem.open(frame_file, "rb") as f:
                # For some reason, just returning the tifffile.TiffFile object gives issues
                return (frame_file, tifffile.TiffFile(f).pages)
        else:
            return None

    # Open the files in parallel to reduce network round-trip wait time.
    # Materialize the results while the executor is still alive.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(open_frame, frames_files))

    headers = dict(entry for entry in results if entry is not None)
    if not headers:
        pytest.skip("No file-format supported frames headers found")

    return headers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get all the frames' header data, multithreaded to reduce network request wait time


# ==================================================================================================
# Gain fixtures
# ==================================================================================================


@pytest.fixture(scope="session")
def gain_mrc_header(gain_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]:
    """Get the mrc file header for a gain file, skipping non-mrc gain files."""
    if gain_file.endswith(".mrc"):
        return {gain_file: get_header(gain_file, filesystem)}
    pytest.skip(f"Not an mrc file, skipping mrc checks: {gain_file}")


# ==================================================================================================
Expand All @@ -44,9 +123,9 @@ def get_headers(mrcfiles: List[str]) -> Dict[str, MrcInterpreter]:


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from here down to the next comment, it's just refactoring the code knowing that there will only be one mrc file for these entities

@pytest.fixture(scope="session")
def tiltseries_mrc_headers(tiltseries_mrc_files: List[str]) -> Dict[str, MrcInterpreter]:
def tiltseries_mrc_header(tiltseries_mrc_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for a tilt series."""
return get_headers(tiltseries_mrc_files)
return get_header(tiltseries_mrc_file, filesystem)


@pytest.fixture(scope="session")
Expand All @@ -57,27 +136,16 @@ def tiltseries_metadata(tiltseries_meta_file: str, filesystem: FileSystemApi) ->


@pytest.fixture(scope="session")
def tiltseries_mdoc(tiltseries_mdoc_files: List[str], filesystem: FileSystemApi) -> pd.DataFrame:
def tiltseries_mdoc(tiltseries_mdoc_file: str, filesystem: FileSystemApi) -> pd.DataFrame:
"""Load the tiltseries mdoc."""
mdocs = {}

for file in tiltseries_mdoc_files:
with filesystem.open(file, "r") as f:
mdocs[file] = mdocfile.read(f)

return mdocs
return mdocfile.read(filesystem.localreadable(tiltseries_mdoc_file))


@pytest.fixture(scope="session")
def tiltseries_tlt(tiltseries_tlt_files: List[str], filesystem: FileSystemApi) -> pd.DataFrame:
def tiltseries_tlt(tiltseries_tlt_file: str, filesystem: FileSystemApi) -> pd.DataFrame:
manasaV3 marked this conversation as resolved.
Show resolved Hide resolved
"""Load the tiltseries tlt."""
tlt_files = {}

for file in tiltseries_tlt_files:
with filesystem.open(file, "r") as f:
tlt_files[file] = pd.read_csv(f, sep=r"\s+", header=None, names=["tilt_angle"])

return tlt_files
with filesystem.open(tiltseries_tlt_file, "r") as f:
return pd.read_csv(f, sep=r"\s+", header=None, names=["tilt_angle"])


# ==================================================================================================
Expand All @@ -86,9 +154,9 @@ def tiltseries_tlt(tiltseries_tlt_files: List[str], filesystem: FileSystemApi) -


@pytest.fixture(scope="session")
def canonical_tomo_mrc_headers(canonical_tomo_mrc_files: List[str]) -> Dict[str, MrcInterpreter]:
def canonical_tomo_mrc_headers(canonical_tomo_mrc_file: str, filesystem: FileSystemApi) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for a tomogram."""
return get_headers(canonical_tomo_mrc_files)
return get_header(canonical_tomo_mrc_file, filesystem)


@pytest.fixture(scope="session")
Expand All @@ -104,63 +172,86 @@ def canonical_tomogram_metadata(canonical_tomo_meta_file: str, filesystem: FileS


@pytest.fixture(scope="session")
def annotation_metadata(annotation_meta_files: List[str], filesystem: FileSystemApi) -> Dict[str, Dict]:
"""Load the annotation metadata."""
def annotation_metadata(annotation_metadata_files: List[str], filesystem: FileSystemApi) -> Dict[str, Dict]:
"""Load the annotation metadata. Dictionary structure: metadata = {metadata_a_filename: Dict}, metadata_b_filename: ...}."""
metadata_objs = {}

for file in annotation_meta_files:
for file in annotation_metadata_files:
with filesystem.open(file, "r") as f:
metadata_objs[file] = json.load(f)

return metadata_objs


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

helper function below to help with ndjson data retrieval (for instance seg, point, and oriented point annotations)

def get_annotations(annotation_files: Dict[str, List[str]], filesystem: FileSystemApi) -> Dict[str, Dict[str, Dict]]:
"""Load annotations."""
def get_ndjson_annotations(
annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, List[Dict]]:
"""
A helper function to load ndjson annotations (a list of annotations, with each annotation being a Dict)
for point, oriented point, and instance segmentation annotations.
Dictionary structure: annotations = {annotation_a_filename: List[Dict], annotation_b_filename: List[Dict]}.
"""
annotations = {}
for annoname, files in annotation_files.items():
annotations[annoname] = {}
for file in files:
with filesystem.open(file, "r") as f:
annotations[annoname][file] = ndjson.load(f)
# Only need the annotation file to find the annotations.
for annotation_file in annotation_files:
with filesystem.open(annotation_file, "r") as f:
annotations[annotation_file] = ndjson.load(f)

return annotations


@pytest.fixture(scope="session")
def point_annotations(
point_annotation_files: Dict[str, List[str]],
point_annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
) -> Dict[str, List[Dict]]:
"""Load point annotations."""
return get_annotations(point_annotation_files, filesystem)
return get_ndjson_annotations(point_annotation_files, filesystem)


@pytest.fixture(scope="session")
def oriented_point_annotations(
oriented_point_annotation_files: Dict[str, List[str]],
oriented_point_annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
) -> Dict[str, List[Dict]]:
"""Load oriented point annotations."""
return get_annotations(oriented_point_annotation_files, filesystem)
return get_ndjson_annotations(oriented_point_annotation_files, filesystem)


@pytest.fixture(scope="session")
def instance_seg_annotations(
instance_seg_annotation_files: Dict[str, List[str]],
instance_seg_annotation_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
) -> Dict[str, List[Dict]]:
"""Load instance segmentation annotations."""
return get_annotations(instance_seg_annotation_files, filesystem)
return get_ndjson_annotations(instance_seg_annotation_files, filesystem)


@pytest.fixture(scope="session")
def seg_mask_annotation_mrc_headers(
seg_mask_annotation_mrc_files: Dict[str, List[str]],
) -> Dict[str, Dict[str, MrcInterpreter]]:
seg_mask_annotation_mrc_files: List[str],
filesystem: FileSystemApi,
) -> Dict[str, MrcInterpreter]:
"""Get the mrc file headers for an mrc annotation file."""
annotations = {}
for annoname, files in seg_mask_annotation_mrc_files.items():
annotations[annoname] = get_headers(files)
headers = {}
for mrc_filename in seg_mask_annotation_mrc_files:
headers[mrc_filename] = get_header(mrc_filename, filesystem)

return annotations
return headers


@pytest.fixture(scope="session")
def seg_mask_annotation_zarr_headers(
    seg_mask_annotation_zarr_files: List[str],
    filesystem: FileSystemApi,
) -> Dict[str, Dict[str, Dict]]:
    """
    Get the zattrs and zarray data for each zarr annotation file.
    Dictionary structure: headers = {annotation_a_filename: {"zattrs": Dict, "zarrays": Dict}}, annotation_b_filename: ...}.
    """
    return {
        zarr_filename: get_zarr_headers(zarr_filename, filesystem)
        for zarr_filename in seg_mask_annotation_zarr_files
    }
Loading
Loading