refactor(breadbox): Remove catalog node aliases #10

Merged · 2 commits · Jul 23, 2024
8 changes: 4 additions & 4 deletions breadbox/breadbox/compute/analysis_tasks.py
@@ -440,8 +440,8 @@ def create_cell_line_group(
     depmap_model_sample_type = get_dimension_type(db, "depmap_model")
 
     (
-        feature_labels_and_aliases,
-        sample_labels_and_aliases,
+        feature_given_id_and_index_df,
+        sample_given_id_and_index_df,
         feature_warnings,
         sample_warnings,
     ) = dataclasses.astuple(
@@ -481,8 +481,8 @@ def create_cell_line_group(
         db,
         user,
         dataset_in,
-        feature_labels_and_aliases,
-        sample_labels_and_aliases,
+        feature_given_id_and_index_df,
+        sample_given_id_and_index_df,
         generic_feature_type,
         depmap_model_sample_type,
     )
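Side note (illustration, not part of this PR's diff): the tuple unpacking above works because dataclasses.astuple() returns field values in declaration order. A minimal runnable sketch of that pattern, using the renamed DataframeValidatedFile fields from breadbox/io/data_validation.py below; the empty DataFrames stand in for real validation output.

import dataclasses
from dataclasses import dataclass
from typing import List

import pandas as pd


@dataclass
class DataframeValidatedFile:
    # Declaration order matters: astuple() emits values in this order, so the
    # unpacking targets line up with these fields positionally.
    feature_given_id_and_index: pd.DataFrame
    sample_given_id_and_index: pd.DataFrame
    feature_warnings: List[str]
    sample_warnings: List[str]


validated = DataframeValidatedFile(pd.DataFrame(), pd.DataFrame(), [], [])
(
    feature_given_id_and_index_df,
    sample_given_id_and_index_df,
    feature_warnings,
    sample_warnings,
) = dataclasses.astuple(validated)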
44 changes: 4 additions & 40 deletions breadbox/breadbox/compute/dataset_tasks.py
@@ -60,26 +60,6 @@ def get_file_dict(datafile) -> FileDict:
     return data_file_dict
 
 
-def _get_transient_dataset_feature_cat_node_id(db, user, added_dataset) -> int:
-    feature_cat_nodes = (
-        db.query(CatalogNode)
-        .join(DatasetFeature, DatasetFeature.id == CatalogNode.dimension_id)
-        .filter(
-            and_(
-                CatalogNode.dataset_id == added_dataset.id,
-                CatalogNode.dimension_id.isnot(None),
-            )
-        )
-        .all()
-    )
-    if len(feature_cat_nodes) != 1:
-        raise HTTPException(
-            400,
-            f"Unexpected number of features for a transient dataset. Expected 1, found {len(feature_cat_nodes)}.",
-        )
-    return feature_cat_nodes[0].id
-
-
 # Convert the FileDict back to an UploadFile after it is passed through redis.
 # This is a general function used for data_file
 def _get_upload_file_from_file_dict(file_dict: FileDict) -> UploadFile:
@@ -238,8 +218,8 @@ def upload_dataset(
 
     try:
         (
-            feature_labels_and_aliases,
-            sample_labels_and_aliases,
+            feature_given_id_and_index_df,
+            sample_given_id_and_index_df,
             feature_warnings,
             sample_warnings,
         ) = dataclasses.astuple(
@@ -280,8 +260,8 @@ def upload_dataset(
         db,
         user,
         dataset,
-        feature_labels_and_aliases,
-        sample_labels_and_aliases,
+        feature_given_id_and_index_df,
+        sample_given_id_and_index_df,
         valid_fields.valid_feature_type,
         valid_fields.valid_sample_type,
     )
@@ -310,21 +290,6 @@ def upload_dataset(
     warning = feature_warning + sample_warning
 
     forwardingUrl = None
-    slice_id = ""
-    if is_transient:
-        feature_cat_node_id = _get_transient_dataset_feature_cat_node_id(
-            db, user, added_dataset
-        )
-        slice_id = str(feature_cat_node_id)
-
-        from urllib.parse import urlencode
-
-        params = {
-            "x": feature_cat_node_id,
-            "y": feature_cat_node_id,
-            "defaultCustomAnalysisToX": True,
-        }
-        forwardingUrl = "/elara?" + urlencode(params)
 
     if update_message:
         update_message("Wrapping up upload...")
@@ -336,7 +301,6 @@
         datasetId=str(added_dataset.id),
         warnings=[warning],
         forwardingUrl=forwardingUrl,
-        sliceId=slice_id,
     )
 
 
52 changes: 4 additions & 48 deletions breadbox/breadbox/crud/dataset.py
@@ -187,8 +187,8 @@ def add_matrix_dataset(
     db: SessionWithUser,
     user: str,
     dataset_in: MatrixDatasetIn,
-    feature_labels_and_aliases: pd.DataFrame,
-    sample_labels_and_aliases: pd.DataFrame,
+    feature_given_id_and_index_df: pd.DataFrame,
+    sample_given_id_and_index_df: pd.DataFrame,
     feature_type: Optional[DimensionType],
     sample_type: DimensionType,
 ):
@@ -243,18 +243,16 @@ def is_binary_category(allowed_values_list):
 
     _add_matrix_dataset_dimensions(
         db=db,
-        index_and_given_id=feature_labels_and_aliases,
+        index_and_given_id=feature_given_id_and_index_df,
         dimension_subtype_cls=DatasetFeature,
         dimension_type_name=feature_type.name if feature_type else None,
-        dataset_catalog_node=dataset_catalog_node,
         dataset=dataset,
     )
     _add_matrix_dataset_dimensions(
         db=db,
-        index_and_given_id=sample_labels_and_aliases,
+        index_and_given_id=sample_given_id_and_index_df,
         dimension_subtype_cls=DatasetSample,
         dimension_type_name=sample_type.name,
-        dataset_catalog_node=dataset_catalog_node,
         dataset=dataset,
     )
 
@@ -656,7 +654,6 @@ def _add_matrix_dataset_dimensions(
     index_and_given_id: pd.DataFrame,
     dimension_subtype_cls: Union[Type[DatasetFeature], Type[DatasetSample]],
     dimension_type_name: Optional[str],
-    dataset_catalog_node: CatalogNode,
     dataset: MatrixDataset,
 ):
     dimensions: List[Union[DatasetFeature, DatasetSample]] = []
@@ -681,19 +678,6 @@ def _add_matrix_dataset_dimensions(
             )
         )
 
-        # `given_id` is being passed in here for the label of the catalog node. This is technically not how
-        # catalog nodes were originally intended to work, but:
-        # 1. we should be eliminating catalog nodes in the future
-        # 2. catalog nodes no longer have any UI impact
-        # 3. removing catalog nodes at this time would require a decent amount of refactoring
-        # This allows us to keep catalog nodes until we're ready to remove them, while at the same time
-        # avoiding catalog nodes depending on labels. (Metadata can now change after datasets are loaded,
-        # so any labels run the risk of being out of date)
-        dimension_catalog_nodes = _create_dataset_dimension_catalog_nodes(
-            row, dimension_id, given_id, dataset_catalog_node,
-        )
-        catalog_nodes.extend(dimension_catalog_nodes)
-
     db.bulk_save_objects(dimensions)
     db.flush()
     add_catalog_nodes(db, catalog_nodes)
@@ -1412,34 +1396,6 @@ def delete_dataset(
     return True
 
 
-def _create_dataset_dimension_catalog_nodes(
-    dimension_row: pd.Series,
-    dimension_id: str,
-    dimension_label: str,
-    dataset_catalog_node: CatalogNode,
-) -> List[CatalogNode]:
-    catalog_nodes: List[CatalogNode] = []
-
-    aliases_set = set()
-    aliases_set.add(dimension_label)
-
-    for alias in aliases_set:
-        catalog_nodes.append(
-            CatalogNode(
-                dataset_id=dataset_catalog_node.dataset_id,
-                dimension_id=dimension_id,
-                priority=0,
-                parent=dataset_catalog_node,
-                label=alias,
-                is_continuous=dataset_catalog_node.is_continuous,
-                is_categorical=dataset_catalog_node.is_categorical,
-                is_binary=dataset_catalog_node.is_binary,
-                is_text=dataset_catalog_node.is_text,
-            )
-        )
-    return catalog_nodes
-
-
 def add_catalog_nodes(db: SessionWithUser, catalog_nodes: List[CatalogNode]):
     for i in range(0, len(catalog_nodes), 10000):  # arbitrary chunk size
         chunk = i + 10000
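Side note (illustration, not part of this PR's diff): add_catalog_nodes writes nodes in chunks of 10,000 rows. Its body past `chunk = i + 10000` is collapsed above, so the slicing and bulk-save calls in this sketch are assumptions rather than the actual breadbox code.

CHUNK_SIZE = 10000  # mirrors the arbitrary chunk size used in add_catalog_nodes


def add_objects_in_chunks(db, objects: list) -> None:
    # Hypothetical sketch: persist a large list of ORM objects in fixed-size
    # batches so no single flush has to handle the whole list at once.
    for i in range(0, len(objects), CHUNK_SIZE):
        db.bulk_save_objects(objects[i : i + CHUNK_SIZE])
        db.flush()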
15 changes: 0 additions & 15 deletions breadbox/breadbox/crud/types.py
@@ -12,7 +12,6 @@
 from breadbox.crud.dataset import (
     ROOT_ID,
     add_catalog_nodes,
-    _create_dataset_dimension_catalog_nodes,
     delete_dataset,
     get_properties_to_index,
     populate_search_index,
@@ -367,16 +366,6 @@ def get_dataset_dimension_axis(
         dataset_dimensions_with_dimension_type_query.session.connection(),
     ).groupby("dataset_id")
     for dataset_key in dims_grouped_by_dataset_df.groups.keys():
-        dataset_catalog_node = (
-            db.query(CatalogNode)
-            .filter(
-                and_(
-                    CatalogNode.dimension_id.is_(None),
-                    CatalogNode.dataset_id == dataset_key,
-                )
-            )
-            .one()
-        )
         dataset_dims_df = dims_grouped_by_dataset_df.get_group(dataset_key)
         # Rename dimension 'id' column in case of collision in naming where dimension type identifier is also named 'id'
         dataset_dims_df.rename(columns={"id": "dimension_id"}, inplace=True)
@@ -387,10 +376,6 @@
             dimension_label = (
                 str(row["label"]) if row["label"] is not None else row["given_id"]
             )
-            new_dim_nodes = _create_dataset_dimension_catalog_nodes(
-                row, row["dimension_id"], dimension_label, dataset_catalog_node
-            )
-            new_dataset_dimension_catalog_nodes.extend(new_dim_nodes)
             if dimension_type.axis == "feature":
                 if (
                     dimension_label != row["feature_label"]
23 changes: 4 additions & 19 deletions breadbox/breadbox/io/data_validation.py
@@ -241,7 +241,8 @@ def _get_dimension_type_given_ids(
     db: SessionWithUser, dimension_type: DimensionType
 ) -> list[str]:
     """
-    Get entity labels and aliases from the metadata for the given dimension type.
+    Get the full list of dimension type given IDs by reading the ID column
+    of the dimension type metadata table.
     """
     given_ids = (
         db.query(TabularCell)
@@ -258,22 +259,6 @@
     return [x for x, in given_ids]
 
 
-def _get_dataset_dimensions_labels(
-    dataset_dimensions: Union[pd.Series, pd.Index],
-    dimension_type_entity_labels_df: pd.DataFrame,
-) -> pd.DataFrame:
-    """
-    For dataset matrix features or samples identifiers, get their canonical names ('label') and aliases (if any) and return it as a dataframe indexed by their ids
-    """
-    dataset_dimensions_df: pd.DataFrame = dataset_dimensions.to_frame(name="given_id")
-    # NOTE: Merging at this stage means changes to dimension's label might not be reflected when stored in DatsetFeature or DatasetSample
-    dataset_dimensions_df = dataset_dimensions_df.merge(
-        dimension_type_entity_labels_df, "left", on="given_id"
-    )
-    dataset_dimensions_df.replace({np.nan: None}, inplace=True)
-    return dataset_dimensions_df
-
-
 @dataclass
 class DimensionLabelsAndWarnings:
     given_id_to_index: pd.DataFrame
@@ -282,8 +267,8 @@ class DimensionLabelsAndWarnings:
 
 @dataclass
 class DataframeValidatedFile:
-    feature_labels_and_aliases: pd.DataFrame
-    sample_labels_and_aliases: pd.DataFrame
+    feature_given_id_and_index: pd.DataFrame
+    sample_given_id_and_index: pd.DataFrame
     feature_warnings: List[str]
     sample_warnings: List[str]
 
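Side note (illustration, not part of this PR's diff): the renamed fields carry a given_id-to-index mapping rather than labels and aliases. A rough sketch of what building such a dataframe could look like; the helper and exact column names are hypothetical, though the deleted _get_dataset_dimensions_labels used a "given_id" column in the same way.

import pandas as pd


def build_given_id_and_index_df(columns: pd.Index, known_given_ids: set):
    # Map each matrix column identifier to its positional index, and warn
    # about identifiers missing from the dimension type metadata.
    df = pd.DataFrame({"given_id": list(columns), "index": range(len(columns))})
    unknown = [g for g in df["given_id"] if g not in known_given_ids]
    warnings = [f"{len(unknown)} identifier(s) not found in metadata"] if unknown else []
    return df, warnings


df, warnings = build_given_id_and_index_df(pd.Index(["ID1", "ID2"]), {"ID1"})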
1 change: 0 additions & 1 deletion breadbox/breadbox/schemas/dataset.py
@@ -316,7 +316,6 @@ class UploadDatasetResponse(BaseModel):
     dataset: DatasetResponse
     warnings: List[str]
     forwardingUrl: Optional[str] = Field()
-    sliceId: Optional[str] = Field()
 
 
 class UploadDatasetResponseV2(BaseModel):
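Side note (illustration, not part of this PR's diff): a minimal sketch of the trimmed response model after removing sliceId, assuming pydantic-style models as above. DatasetResponse here is a stand-in for the real schema, and the explicit None default is only so the sketch runs on its own.

from typing import List, Optional

from pydantic import BaseModel, Field


class DatasetResponse(BaseModel):  # placeholder for the real DatasetResponse schema
    id: str


class UploadDatasetResponse(BaseModel):
    dataset: DatasetResponse
    warnings: List[str]
    forwardingUrl: Optional[str] = Field(None)
    # sliceId is gone: callers no longer receive a catalog-node id for transient uploads


resp = UploadDatasetResponse(
    dataset=DatasetResponse(id="some-dataset-id"), warnings=[], forwardingUrl=None
)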
2 changes: 0 additions & 2 deletions breadbox/pyright-ratchet-errors.txt
@@ -39,10 +39,8 @@ types.py: error: Argument of type "Dict[str, AnnotationType] | None" cannot be a
 types.py: error: Argument of type "Dict[str, AnnotationType] | None" cannot be assigned to parameter "annotation_type_mapping" of type "Dict[str, AnnotationType]" in function "validate_dimension_type_metadata"
 types.py: error: Argument of type "Dict[str, Any] | None" cannot be assigned to parameter "reference_column_mappings" of type "Dict[str, str]" in function "check_id_mapping_is_valid"
 types.py: error: Argument of type "Json[IdMappingInsanity] | None" cannot be assigned to parameter "id_mapping" of type "Dict[str, Any] | None" in function "__init__" (reportArgumentType)
-types.py: error: Argument of type "Series | Unknown | ndarray[Any, Unknown] | Any | NDArray[Unknown]" cannot be assigned to parameter "dimension_id" of type "str" in function "_create_dataset_dimension_catalog_nodes"
 types.py: error: Argument of type "Text" cannot be assigned to parameter "axis" of type "Literal['feature', 'sample']" in function "__init__"
 types.py: error: Argument of type "str | None" cannot be assigned to parameter "file_name" of type "str" in function "update_dimension_type_metadata"
-types.py: error: Argument of type "str | Series | Unknown | ndarray[Any, Unknown] | Any | NDArray[Unknown]" cannot be assigned to parameter "dimension_label" of type "str" in function "_create_dataset_dimension_catalog_nodes"
 types.py: error: Cannot assign to attribute "dataset_id" for class "DimensionType"
 types.py: error: Expression of type "IdMapping" is incompatible with declared type "Json[IdMappingInsanity] | None"
 types.py: error: Expression of type "None" cannot be assigned to parameter of type "Dict[str, AnnotationType]"
53 changes: 20 additions & 33 deletions breadbox/tests/api/test_datasets.py
@@ -9,14 +9,17 @@
 
 from breadbox.db.session import SessionWithUser
 from breadbox.models.dataset import (
+    AnnotationType,
     CatalogNode,
     DatasetFeature,
     DatasetSample,
     Dataset,
+    Dimension,
+    MatrixDataset,
     TabularDataset,
     TabularColumn,
     TabularCell,
-    AnnotationType,
+    ValueType,
 )
 from fastapi.testclient import TestClient
 from breadbox.api.dependencies import get_dataset
@@ -1349,9 +1352,6 @@ def test_add_dataset(
         catalog_nodes = minimal_db.query(CatalogNode).all()
         assert len(feature_indexes) == 3  # Number of feaures should be 3
         assert len(sample_indexes) == 2  # Number of feaures should be 2
-        assert (
-            len(catalog_nodes) == 7
-        )  # Number of nodes is features + samples + dataset + root
 
     def test_add_dataset_no_write_access(
         self,
@@ -1450,22 +1450,15 @@ def test_add_categorical_dataset(
         result_dataset = r.json()["result"]["dataset"]
         feature_indexes = minimal_db.query(DatasetFeature).all()
         sample_indexes = minimal_db.query(DatasetSample).all()
-        catalog_nodes = minimal_db.query(CatalogNode).all()
-        dataset_node = (
-            minimal_db.query(CatalogNode)
-            .filter(
-                and_(
-                    CatalogNode.dataset_id == result_dataset["id"],
-                    CatalogNode.dimension_id.is_(None),
-                )
-            )
-            .one()
-        )
         assert len(feature_indexes) == 3  # Number of feaures should be 3
         assert len(sample_indexes) == 2  # Number of samples should be 2
-        assert len(catalog_nodes) == 7  # Number features + samples + root + dataset
-        assert dataset_node
-        assert dataset_node.is_categorical == True  # and dataset_node.is_binary == True
+        categorical_dataset = (
+            minimal_db.query(MatrixDataset)
+            .filter(MatrixDataset.id == result_dataset["id"])
+            .one()
+        )
+        assert categorical_dataset
+        assert categorical_dataset.value_type == ValueType.categorical
 
     def test_add_categorical_and_binary_dataset(
         self, client: TestClient, minimal_db, private_group: Dict, mock_celery
@@ -1782,12 +1775,12 @@ def test_add_dataset_with_missing_values(
         assert minimal_db.query(Dataset).filter_by(id=result_dataset["id"]).one()
         assert (
             len(
-                minimal_db.query(CatalogNode)
+                minimal_db.query(Dimension)
                 .filter_by(dataset_id=result_dataset["id"])
                 .all()
             )
-            == 6
-        )  # 3 features + 2 samples + 1 dataset node
+            == 5
+        )  # 3 features + 2 samples
 
         r_categorical = client.post(
             "/datasets/?allowed_values=Thing1&allowed_values=Thing2&allowed_values=Thing3",
@@ -1820,23 +1813,17 @@
         assert minimal_db.query(Dataset).filter_by(id=result_dataset["id"]).one()
         assert (
             len(
-                minimal_db.query(CatalogNode)
+                minimal_db.query(Dimension)
                 .filter_by(dataset_id=result_dataset["id"])
                 .all()
             )
-            == 6
-        )  # 3 features + 2 samples + 1 dataset node
-        categorical_dataset_node = (
-            minimal_db.query(CatalogNode)
-            .filter(
-                and_(
-                    CatalogNode.dataset_id == result_dataset["id"],
-                    CatalogNode.dimension_id.is_(None),
-                )
-            )
+            == 5
+        )  # 3 features + 2 samples
+        categorical_dataset = (
+            minimal_db.query(MatrixDataset)
+            .filter(MatrixDataset.value_type == ValueType.categorical)
             .one()
         )
-        categorical_dataset = categorical_dataset_node.dataset
         feature_indices = [
             tup[0]
             for tup in minimal_db.query(DatasetFeature.index)