Merge branch 'master' into qa

broadinstitute · Jan 9, 2025 · ba4ea17 · ba4ea17
2 parents 949381f + cd53521
commit ba4ea17
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 49 deletions.
diff --git a/portal-backend/depmap/breadbox_shim/breadbox_shim.py b/portal-backend/depmap/breadbox_shim/breadbox_shim.py
@@ -5,22 +5,22 @@
 from breadbox_client.models.compute_response import ComputeResponse
 from breadbox_client.models import (
     MatrixDatasetResponse,
-    MatrixDatasetResponseFormat,
     TabularDatasetResponse,
     FeatureResponse,
     ValueType,
 )
-
 from depmap.data_access.response_parsing import (
     format_breadbox_task_status,
     get_breadbox_slice_id,
     parse_breadbox_slice_id,
+    remove_breadbox_prefix,
 )
 from depmap.interactive.config.categories import CustomCellLinesConfig
 from depmap.vector_catalog.models import Node, NodeType
 from depmap.vector_catalog.trees import Trees
 from depmap.partials.matrix.models import CellLineSeries
 from depmap import extensions
+from depmap_compute.slice import SliceQuery
 
 # Since breadbox and the legacy backend contain different datasets, we need to combine
 # values from each of their responses before returning a value.
@@ -56,7 +56,7 @@ def __init__(
 
 
 class BreadboxVectorCatalogNodeInfo:
-    """
+    """ 
     Vector Catalog endpoints return a specific dictionary structure for each parent node 
     in the vector catalog tree. This class reflects that same structure and contains
     some defaults specific to breadbox.
@@ -222,7 +222,7 @@ def _get_feature_node_info_with_siblings(
 def run_custom_analysis(
     analysis_type: str,
     dataset_slice_id: str,
-    query_feature_slice_id: str,
+    slice_query: Optional[SliceQuery],
     vector_variable_type: str,
     query_cell_lines: Optional[list[str]],
     query_values: Optional[list[Any]],
@@ -233,19 +233,33 @@ def run_custom_analysis(
     or a legacy portal feature (specified with the given feature_data).
     Return a task status.
     """
-    dataset_uuid = parse_breadbox_slice_id(dataset_slice_id).dataset_id
-    if query_feature_slice_id:
-        feature_id = parse_breadbox_slice_id(query_feature_slice_id).feature_id
-        query_dataset_id = parse_breadbox_slice_id(query_feature_slice_id).dataset_id
+    bb_dataset_id = remove_breadbox_prefix(dataset_slice_id)
+    if slice_query:
+        # Temporary hack: for now, this slice query ALWAYS specifies a feature by label.
+        # This should always be true for our current feature selection component.
+        # Soon, this "breadbox_shim" should be removed entirely, along with this hack.
+        assert slice_query.identifier_type == "feature_label"
+
+        # Hack part 2: Custom analysis in Breadbox was set up to take a feature's given ID.
+        # We have the label here and need to use that to load the given ID.
+        query_dataset_id = parse_breadbox_slice_id(slice_query.dataset_id).dataset_id
+        all_dataset_features = extensions.breadbox.client.get_dataset_features(
+            query_dataset_id
+        )
+        feature_id = None
+        for bb_feature in all_dataset_features:
+            if bb_feature["label"] == slice_query.identifier:
+                feature_id = bb_feature["id"]
         assert (
             feature_id is not None
-        ), "query_feature_slice_id must contain a feature ID"
+        ), f"Unexpected feature label passed to breadbox custom analysis: '{slice_query.identifier}'"
+
     else:
         feature_id = ""
         query_dataset_id = ""
     bb_task_status = extensions.breadbox.client.compute_univariate_associations(
         analysis_type=analysis_type,
-        dataset_id=dataset_uuid,
+        dataset_id=bb_dataset_id,
         query_feature_id=feature_id,
         query_dataset_id=query_dataset_id,
         vector_variable_type=vector_variable_type,

diff --git a/portal-backend/depmap/compute/views.py b/portal-backend/depmap/compute/views.py
@@ -15,6 +15,7 @@
 from depmap.extensions import csrf_protect, restplus_handle_exception
 from depmap.compute import analysis_tasks
 from depmap_compute.models import AnalysisType
+from depmap_compute.slice import slice_id_to_slice_query
 from depmap.user_uploads.utils.task_utils import get_current_result_dir
 
 blueprint = Blueprint(
@@ -113,35 +114,41 @@ def post(self):
         else:
             raise ValueError("Unexpected analysis type {}".format(analysis_type))
 
+        # Parse the slice ID if one was provided
+        if query_id:
+            slice_query = slice_id_to_slice_query(query_id)
+        else:
+            slice_query = None
+
         # Forward requests to breadbox if a breadbox dataset is requested
         if dataset_id.startswith("breadbox/"):
-            # If the query feature is from a legacy dataset, load it now and pass the values to breadbox
+            # If the query slice is from a legacy dataset, load it now and pass the values to breadbox
             # The query_cell_lines parameter needs to be the same order/length as the query_values when passed to breadbox.
-            if query_id and not query_id.startswith("breadbox/"):
-                legacy_feature_series: pd.Series = interactive_utils.get_row_of_values_from_slice_id(
-                    query_id
-                )
+            if slice_query and not slice_query.dataset_id.startswith("breadbox/"):
+                legacy_data_slice = data_access.get_slice_data(slice_query)
                 if query_cell_lines is not None:
                     # When the cell lines have been filtered by the user,
                     # the legacy feature series also needs to be filtered before being passed to breadbox.
-                    feature_cell_lines = legacy_feature_series.index.tolist()
+                    feature_cell_lines = legacy_data_slice.index.tolist()
                     unordered_cell_lines_interesection = list(
                         set(query_cell_lines).intersection(set(feature_cell_lines))
                     )
                     if len(unordered_cell_lines_interesection) == 0:
                         return format_taskless_error_message(
                             "No cell lines in common between query and dataset searched"
                         )
-                    legacy_feature_series = legacy_feature_series.loc[
+                    legacy_data_slice = legacy_data_slice.loc[
                         unordered_cell_lines_interesection
                     ]
-                query_values = legacy_feature_series.tolist()
-                query_cell_lines = legacy_feature_series.index.tolist()
-                query_id = None
+                query_values = legacy_data_slice.tolist()
+                query_cell_lines: list[
+                    str
+                ] = legacy_data_slice.index.tolist()  # pyright: ignore
+                slice_query = None
             return breadbox_shim.run_custom_analysis(
                 analysis_type=analysis_type,
                 dataset_slice_id=dataset_id,
-                query_feature_slice_id=query_id,
+                slice_query=slice_query,
                 vector_variable_type=vector_variable_type,
                 query_cell_lines=query_cell_lines,
                 query_values=query_values,
@@ -164,12 +171,8 @@ def post(self):
             # 1. main query vector
             # 2. which is dependent/independent, the matrix or the vector
             # 3. optionally, a list of cell line depmap ids
-            if query_id.startswith("breadbox/"):
-                query_series = breadbox_shim.get_feature_data_slice(slice_id=query_id)
-            else:
-                query_series = interactive_utils.get_row_of_values_from_slice_id(
-                    query_id
-                )
+            assert slice_query is not None
+            query_series = data_access.get_slice_data(slice_query)
 
             # cl_query_vector is the intersection of cell lines in both data tracts plus the cell line subset
             (

diff --git a/portal-backend/depmap/data_access/response_parsing.py b/portal-backend/depmap/data_access/response_parsing.py
@@ -24,7 +24,7 @@ def parse_breadbox_slice_id(slice_id: str) -> ParsedBreadboxSliceId:
     """
     Parse the breadbox dataset ID and feature ID from the given slice ID. If the given 
     slice ID is malformed, throw a Bad Request error. Slice IDs should be formatted like 
-    'breadbox/<dataset-uuid>/<feature-uuid>' or 'breadbox/<dataset-uuid>'.
+    'breadbox/<dataset-uuid>/<feature-given-id>' or 'breadbox/<dataset-uuid>'.
     """
     match = re.match(BREADBOX_SLICE_ID_REGEX, slice_id)
     assert match, f"Breadbox slice id '{slice_id}' does not match the expected format."

diff --git a/portal-backend/pyright-ratchet-errors.txt b/portal-backend/pyright-ratchet-errors.txt
@@ -667,15 +667,13 @@ views.py: error: Argument of type "Literal['url']" cannot be assigned to paramet
 views.py: error: Argument of type "Literal['user_id']" cannot be assigned to parameter "__s" of type "slice" in function "__getitem__"
 views.py: error: Argument of type "Literal['value']" cannot be assigned to parameter "__s" of type "slice" in function "__getitem__"
 views.py: error: Argument of type "Literal[DependencyEnum.Avana]" cannot be assigned to parameter "dependency_dataset_name" of type "str" in function "get_dataset_by_name"
-views.py: error: Argument of type "Unknown | Any | None" cannot be assigned to parameter "query_feature_slice_id" of type "str" in function "run_custom_analysis"
 views.py: error: Argument of type "Unknown | FileStorage" cannot be assigned to parameter "filepath_or_buffer" of type "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]" in function "read_csv" (reportArgumentType)
 views.py: error: Argument of type "Unknown | Hashable" cannot be assigned to parameter "group_name" of type "str" in function "__init__"
 views.py: error: Argument of type "Unknown | None" cannot be assigned to parameter "cell_line_col_index" of type "int" in function "get_all_cell_line_compound_sensitivity"
 views.py: error: Argument of type "Unknown | None" cannot be assigned to parameter "cell_line_col_index" of type "int" in function "get_all_cell_line_gene_effects"
 views.py: error: Argument of type "Unknown | int | list[Unknown]" cannot be assigned to parameter "color_num" of type "str | int" in function "__init__" (reportArgumentType)
 views.py: error: Argument of type "list[BreadboxVectorCatalogNodeInfo]" cannot be assigned to parameter "__iterable" of type "Iterable[dict[Unknown, Unknown]]" in function "extend"
 views.py: error: Argument of type "list[CellLineSeries]" cannot be assigned to parameter "breadbox_feature_data" of type "list[Series]" in function "get_df_from_feature_list"
-views.py: error: Argument of type "list[Unknown] | Any | list[int] | Unknown" cannot be assigned to parameter "query_cell_lines" of type "list[str] | None" in function "run_custom_analysis" (reportArgumentType)
 views.py: error: Argument of type "list[list[Unknown]] | None" cannot be assigned to parameter "compound_experiment_and_datasets" of type "List[Tuple[CompoundExperiment, DependencyDataset]]" in function "format_dep_dist_caption"
 views.py: error: Argument of type "str | Any" cannot be assigned to parameter "__s" of type "slice" in function "__getitem__"
 views.py: error: Argument of type "str | None" cannot be assigned to parameter "s" of type "str | bytes | bytearray" in function "loads" (reportArgumentType)

diff --git a/portal-backend/tests/depmap/compute/test_views.py b/portal-backend/tests/depmap/compute/test_views.py
@@ -376,31 +376,21 @@ def test_compute_univariate_associations_with_breadbox_feature(
     empty_db_mock_downloads.session.flush()
     interactive_test_utils.reload_interactive_config()
 
-    # Mock the breadbox client response
-    mock_breadbox_feature_data = [
-        FeatureResponse(
-            feature_id="foo",
-            dataset_id="bar",
-            values=FeatureResponseValues.from_dict(
-                {
-                    cell_lines[0].depmap_id: 0.1,
-                    cell_lines[1].depmap_id: 0.2,
-                    cell_lines[2].depmap_id: 0.3,
-                }
-            ),
-            label="feature_foo",
-            units="inches",
-            dataset_label="dataset_bar",
+    mock_breadbox_client.get_dataset_data = MagicMock(
+        return_value=pd.DataFrame(
+            data={"foo": [0.1, 0.2, 0.3]},
+            index=[
+                cell_lines[0].depmap_id,
+                cell_lines[1].depmap_id,
+                cell_lines[2].depmap_id,
+            ],
         )
-    ]
-    mock_breadbox_client.get_feature_data = MagicMock(
-        return_value=mock_breadbox_feature_data
     )
 
     with app.test_client() as c:
         # assemble query parameters
         dataset_id = gene_dataset.name.name
-        breadbox_slice_id = "breadbox/foo/bar"
+        breadbox_slice_id = "slice/breadbox%2Ffoo/feature_foo/label"
 
         parameters = {
             "analysisType": "pearson",