From 1ec80d0fb8f2133c7129683630b600ec4c413fb2 Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 17:35:44 +0100 Subject: [PATCH 01/10] =?UTF-8?q?=F0=9F=9A=A7=20First=20draft?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + docs/arrays.ipynb | 2 +- docs/registries.ipynb | 79 +++++-- lamindb/_artifact.py | 7 +- lamindb/_query_set.py | 379 ++++++++++++++++++++++-------- lamindb/core/_context.py | 2 +- lamindb/core/datasets/__init__.py | 6 +- lamindb/core/datasets/_small.py | 40 +++- 8 files changed, 388 insertions(+), 128 deletions(-) diff --git a/.gitignore b/.gitignore index 9eaaf80bc..54f806162 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ lamin-tutorial/ mytest/ rds/ mydb/ +docs/test-registries/ docs/test-annotate-flexible/ docs/lamindb.* lamin_sphinx diff --git a/docs/arrays.ipynb b/docs/arrays.ipynb index 264abc656..ef2549314 100644 --- a/docs/arrays.ipynb +++ b/docs/arrays.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Query arrays" + "# Slice arrays" ] }, { diff --git a/docs/registries.ipynb b/docs/registries.ipynb index f97e6847d..68f9805a9 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "0a706415", + "metadata": {}, + "source": [ + "[![Jupyter Notebook](https://img.shields.io/badge/Source%20on%20GitHub-orange)](https://github.com/laminlabs/lamindb/blob/main/docs/registries.ipynb)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -14,7 +22,7 @@ "id": "2079f103", "metadata": {}, "source": [ - "This guide walks through all the ways of finding metadata records in LaminDB registries." + "This guide walks through different ways of querying & searching LaminDB registries." ] }, { @@ -28,8 +36,8 @@ }, "outputs": [], "source": [ - "# !pip install lamindb\n", - "!lamin init --storage ./test-registries" + "# !pip install 'lamindb[bionty]'\n", + "!lamin init --storage ./test-registries --schema bionty" ] }, { @@ -37,7 +45,7 @@ "id": "86abfd19", "metadata": {}, "source": [ - "We'll need some toy data." + "The hidden cell below creates exemplary datasets and saves them in a LaminDB instance." 
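+    "\n",
+    "The setup boils down to two ingestion calls, shown here as a minimal sketch (`ln.Artifact` for a file on disk, `ln.Artifact.from_df` for an in-memory `DataFrame`):\n",
+    "```python\n",
+    "import lamindb as ln\n",
+    "from lamindb.core import datasets\n",
+    "\n",
+    "ln.Artifact(datasets.file_jpg_paradisi05(), key=\"images/my_image.jpg\").save()\n",
+    "ln.Artifact.from_df(datasets.df_iris(), key=\"iris/iris_collection.parquet\").save()\n",
+    "```"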
] }, { @@ -46,17 +54,61 @@ "id": "3ba53765", "metadata": { "tags": [ - "hide-output" + "hide-cell" ] }, "outputs": [], "source": [ "import lamindb as ln\n", + "import bionty as bt\n", + "from lamindb.core import datasets\n", "\n", - "# create toy data\n", - "ln.Artifact(ln.core.datasets.file_jpg_paradisi05(), description=\"My image\").save()\n", - "ln.Artifact.from_df(ln.core.datasets.df_iris(), description=\"The iris collection\").save()\n", - "ln.Artifact(ln.core.datasets.file_fastq(), description=\"My fastq\").save()\n", + "ln.track(\"pd7UR7Z8hoTq0000\")\n", + "\n", + "# Create non-curated datasets\n", + "ln.Artifact(datasets.file_jpg_paradisi05(), key=\"images/my_image.jpg\").save()\n", + "ln.Artifact(datasets.file_fastq(), key=\"raw/my_fastq.fastq\").save()\n", + "ln.Artifact.from_df(datasets.df_iris(), key=\"iris/iris_collection.parquet\").save()\n", + "\n", + "# Create a more complex case\n", + "# observation-level metadata\n", + "ln.Feature(name=\"cell_medium\", dtype=\"cat[ULabel]\").save()\n", + "ln.Feature(name=\"sample_note\", dtype=\"str\").save()\n", + "ln.Feature(name=\"cell_type_by_expert\", dtype=\"cat[bionty.CellType]\").save()\n", + "ln.Feature(name=\"cell_type_by_model\", dtype=\"cat[bionty.CellType]\").save()\n", + "# dataset-level metadata\n", + "ln.Feature(name=\"temperature\", dtype=\"float\").save()\n", + "ln.Feature(name=\"study\", dtype=\"cat[ULabel]\").save()\n", + "ln.Feature(name=\"date_of_study\", dtype=\"date\").save()\n", + "ln.Feature(name=\"study_note\", dtype=\"str\").save()\n", + "\n", + "## Permissible values for categoricals\n", + "ln.ULabel.from_values([\"DMSO\", \"IFNG\"], create=True).save()\n", + "ln.ULabel.from_values(\n", + " [\"Candidate marker study 1\", \"Candidate marker study 2\"], create=True\n", + ").save()\n", + "bt.CellType.from_values([\"B cell\", \"T cell\"], create=True).save()\n", + "\n", + "# Ingest dataset1\n", + "adata = datasets.small_dataset1(format=\"anndata\")\n", + "curator = ln.Curator.from_anndata(\n", + " adata,\n", + " var_index=bt.Gene.symbol,\n", + " categoricals={\n", + " \"cell_medium\": ln.ULabel.name,\n", + " \"cell_type_by_expert\": bt.CellType.name,\n", + " \"cell_type_by_model\": bt.CellType.name,\n", + " },\n", + " organism=\"human\",\n", + ")\n", + "artifact = curator.save_artifact(key=\"example_datasets/dataset1.h5ad\")\n", + "artifact.features.add_values(adata.uns)\n", + "\n", + "# Ingest dataset2\n", + "adata2 = datasets.small_dataset2(format=\"anndata\")\n", + "curator = ln.Curator.from_anndata(adata2, var_index=bt.Gene.symbol, categoricals={\"cell_medium\": ln.ULabel.name, \"cell_type_by_model\": bt.CellType.name}, organism=\"human\")\n", + "artifact2 = curator.save_artifact(key=\"example_datasets/dataset2.h5ad\")\n", + "artifact2.features.add_values(adata2.uns)\n", "\n", "# see the content of the artifact registry\n", "ln.Artifact.df()" @@ -619,7 +671,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py310", "language": "python", "name": "python3" }, @@ -633,12 +685,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "vscode": { - "interpreter": { - "hash": "ae1fefc8646a06dd2e75004cd934adda7c5727b046986a772e3b44b0ffba9754" - } + "version": "3.10.15" } }, "nbformat": 4, diff --git a/lamindb/_artifact.py b/lamindb/_artifact.py index e48d3e755..1b56cc601 100644 --- a/lamindb/_artifact.py +++ b/lamindb/_artifact.py @@ -615,11 +615,14 @@ def __init__(artifact: Artifact, *args, **kwargs): 
init_self_from_db(artifact, kwargs_or_artifact)
         # adding "key" here is dangerous because key might be auto-populated
-        update_attributes(artifact, {"description": description})
-        if artifact.key != key and key is not None:
+        attr_to_update = {"description": description}
+        if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
+            attr_to_update["key"] = key
+        elif artifact.key != key and key is not None:
             logger.warning(
                 f"key {artifact.key} on existing artifact differs from passed key {key}"
             )
+        update_attributes(artifact, attr_to_update)
         return None
     else:
         kwargs = kwargs_or_artifact
diff --git a/lamindb/_query_set.py b/lamindb/_query_set.py
index 354debb4f..45dc386f1 100644
--- a/lamindb/_query_set.py
+++ b/lamindb/_query_set.py
@@ -1,20 +1,22 @@
 from __future__ import annotations
 
+import re
 from collections import UserList
 from collections.abc import Iterable
 from collections.abc import Iterable as IterableType
-from dataclasses import is_dataclass
 from typing import TYPE_CHECKING, Any, Generic, NamedTuple, TypeVar
 
 import pandas as pd
 from django.db import models
-from django.db.models import F
-from lamin_utils import colors, logger
+from django.db.models import F, ForeignKey, ManyToManyField
+from django.db.models.fields.related import ForeignObjectRel
+from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lnschema_core.models import (
     Artifact,
     CanCurate,
     Collection,
+    Feature,
     IsVersioned,
     Record,
     Registry,
@@ -30,7 +32,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from lnschema_core.types import ListLike, StrField
+    from lnschema_core.types import StrField, listLike
 
 
 class MultipleResultsFound(Exception):
@@ -184,6 +186,251 @@ def save(self) -> RecordList[T]:
         return self
 
 
+def get_basic_field_names(qs: QuerySet) -> list[str]:
+    exclude_field_names = ["updated_at"]
+    field_names = [
+        field.name
+        for field in qs.model._meta.fields
+        if (
+            not isinstance(field, models.ForeignKey)
+            and field.name not in exclude_field_names
+        )
+    ]
+    field_names += [
+        f"{field.name}_id"
+        for field in qs.model._meta.fields
+        if isinstance(field, models.ForeignKey)
+    ]
+    # move bookkeeping fields to the end, keep uid first
+    for field_name in ["run_id", "created_at", "created_by_id", "updated_at"]:
+        if field_name in field_names:
+            field_names.remove(field_name)
+            field_names.append(field_name)
+    if field_names[0] != "uid" and "uid" in field_names:
+        field_names.remove("uid")
+        field_names.insert(0, "uid")
+    return field_names
+
+
+def get_feature_annotate_kwargs(show_features: bool | list[str]) -> dict[str, Any]:
+    features = Feature.filter()
+    if isinstance(show_features, list):
+        # filter() returns a new queryset, so reassign it to restrict the features
+        features = features.filter(name__in=show_features)
+    # Get the categorical features
+    cat_feature_types = {
+        feature.dtype.replace("cat[", "").replace("]", "")
+        for feature in features
+        if feature.dtype.startswith("cat[")
+    }
+    # Get relationships of labels and features
+    link_models_on_models = {
+        getattr(
+            Artifact, obj.related_name
+        ).through.__get_name_with_schema__(): obj.related_model.__get_name_with_schema__()
+        for obj in Artifact._meta.related_objects
+        if obj.related_model.__get_name_with_schema__() in cat_feature_types
+    }
+    link_models_on_models["ArtifactULabel"] = "ULabel"
+    link_attributes_on_models = {
+        obj.related_name: link_models_on_models[
+            obj.related_model.__get_name_with_schema__()
+        ]
+        for obj in Artifact._meta.related_objects
+        if obj.related_model.__get_name_with_schema__() in link_models_on_models
+    }
+    # Prepare Django's annotate for features
+    annotate_kwargs = {}
+    for link_attr, feature_type in link_attributes_on_models.items():
+        annotate_kwargs[f"{link_attr}__feature__name"] = F(
+            f"{link_attr}__feature__name"
+        )
+        field_name = (
+            feature_type.split(".")[1] if "." in feature_type else feature_type
+        ).lower()
+        annotate_kwargs[f"{link_attr}__{field_name}__name"] = F(
+            f"{link_attr}__{field_name}__name"
+        )
+
+    annotate_kwargs["_feature_values__feature__name"] = F(
+        "_feature_values__feature__name"
+    )
+    annotate_kwargs["_feature_values__value"] = F("_feature_values__value")
+    return annotate_kwargs
+
+
+# https://claude.ai/share/16280046-6ae5-4f6a-99ac-dec01813dc3c
+def analyze_lookup_cardinality(
+    model_class: Registry, lookup_paths: str | list[str] | None
+) -> dict[str, str]:
+    """Analyze lookup cardinality.
+
+    Analyzes Django model lookups to determine if they will result in
+    one-to-one or one-to-many relationships when used in annotations.
+
+    Args:
+        model_class: The Django model class to analyze
+        lookup_paths: Lookup paths to analyze (e.g. ["created_by__name", "ulabels__name"])
+
+    Returns:
+        Dictionary mapping lookup paths to either 'one' or 'many'
+    """
+    result = {}  # type: ignore
+    if lookup_paths is None:
+        return result
+    elif isinstance(lookup_paths, str):
+        lookup_paths = [lookup_paths]
+    for lookup_path in lookup_paths:
+        parts = lookup_path.split("__")
+        current_model = model_class
+        is_many = False
+
+        # Walk through each part of the lookup path
+        for part in parts[:-1]:  # Exclude the last part as it's an attribute
+            field = None
+
+            # Handle reverse relations
+            for f in current_model._meta.get_fields():
+                if isinstance(f, ForeignObjectRel) and f.get_accessor_name() == part:
+                    field = f
+                    is_many = not f.one_to_one
+                    if hasattr(f, "field"):
+                        current_model = f.field.model
+                    break
+
+            # Handle forward relations
+            if field is None:
+                field = current_model._meta.get_field(part)
+                if isinstance(field, ManyToManyField):
+                    is_many = True
+                    current_model = field.remote_field.model
+                elif isinstance(field, ForeignKey):
+                    current_model = field.remote_field.model
+
+        result[lookup_path] = "many" if is_many else "one"
+
+    return result
+
+
+# https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003
+# https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2
+def reshape_annotate_result(
+    field_names: list[str],
+    df: pd.DataFrame,
+    extra_columns: dict[str, str] | None = None,
+    features: bool | list[str] = False,
+) -> pd.DataFrame:
+    """Reshapes experimental data with optional feature handling.
+
+    Args:
+        field_names: List of basic fields to include in the result
+        df: Input dataframe with experimental data
+        extra_columns: Dict specifying additional columns to process with types ('one' or 'many'),
+            e.g., {'ulabels__name': 'many', 'created_by__name': 'one'}
+        features: If False, skip feature processing. If True, process all features.
+            If list of strings, only process specified features.
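+
+    Example (illustrative; the column names are hypothetical):
+        >>> df = pd.DataFrame({
+        ...     "id": [1, 1],
+        ...     "uid": ["a", "a"],
+        ...     "ulabels__name": ["x", "y"],
+        ... })
+        >>> out = reshape_annotate_result(["id", "uid"], df, {"ulabels__name": "many"})
+        >>> sorted(out.loc[0, "ulabels__name"])
+        ['x', 'y']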
+ + Returns: + DataFrame with reshaped data + """ + extra_columns = extra_columns or {} + + # Initialize result with basic fields + result = df[field_names].drop_duplicates(subset=["id"]) + + # Process features if requested + if features: + # Handle _feature_values if columns exist + feature_cols = ["_feature_values__feature__name", "_feature_values__value"] + if all(col in df.columns for col in feature_cols): + feature_values = process_feature_values(df, features) + if not feature_values.empty: + result = result.merge(feature_values, on="id", how="left") + + # Handle links features if they exist + links_features = [ + col + for col in df.columns + if "feature__name" in col and col.startswith("links_") + ] + + if links_features: + result = process_links_features(df, result, links_features, features) + + # Process extra columns + if extra_columns: + result = process_extra_columns(df, result, extra_columns) + + return result + + +def process_feature_values( + df: pd.DataFrame, features: bool | list[str] +) -> pd.DataFrame: + """Process _feature_values columns.""" + feature_values = df.groupby(["id", "_feature_values__feature__name"])[ + "_feature_values__value" + ].agg(set) + + # Filter features if specific ones requested + if isinstance(features, list): + feature_values = feature_values[ + feature_values.index.get_level_values( + "_feature_values__feature__name" + ).isin(features) + ] + + return feature_values.unstack().reset_index() + + +def process_links_features( + df: pd.DataFrame, + result: pd.DataFrame, + feature_cols: list[str], + features: bool | list[str], +) -> pd.DataFrame: + """Process links_XXX feature columns.""" + for feature_col in feature_cols: + prefix = re.match(r"links_(.+?)__feature__name", feature_col).group(1) + + value_cols = [ + col + for col in df.columns + if col.startswith(f"links_{prefix}__") + and col.endswith("__name") + and "feature__name" not in col + ] + + if not value_cols: + continue + + value_col = value_cols[0] + feature_names = df[feature_col].unique() + + # Filter features if specific ones requested + if isinstance(features, list): + feature_names = [f for f in feature_names if f in features] + + for feature_name in feature_names: + mask = df[feature_col] == feature_name + feature_values = df[mask].groupby("id")[value_col].agg(set) + result[feature_name] = result["id"].map(feature_values) + + return result + + +def process_extra_columns( + df: pd.DataFrame, result: pd.DataFrame, extra_columns: dict[str, str] +) -> pd.DataFrame: + """Process additional columns based on their specified types.""" + for col, col_type in extra_columns.items(): + if col not in df.columns: + continue + + values = df.groupby("id")[col].agg(set if col_type == "many" else "first") + result[col] = result["id"].map(values) + + return result + + class QuerySet(models.QuerySet): """Sets of records returned by queries. 
@@ -193,108 +440,38 @@ class QuerySet(models.QuerySet): Examples: - >>> ln.ULabel(name="my label").save() - >>> queryset = ln.ULabel.filter(name="my label") + >>> ULabel(name="my label").save() + >>> queryset = ULabel.filter(name="my label") >>> queryset """ @doc_args(Record.df.__doc__) def df( - self, include: str | list[str] | None = None, join: str = "inner" + self, + include: str | list[str] | None = None, + features: bool | list[str] = False, + join: str = "inner", ) -> pd.DataFrame: """{}""" # noqa: D415 - # re-order the columns - exclude_field_names = ["updated_at"] - field_names = [ - field.name - for field in self.model._meta.fields - if ( - not isinstance(field, models.ForeignKey) - and field.name not in exclude_field_names - ) - ] - field_names += [ - f"{field.name}_id" - for field in self.model._meta.fields - if isinstance(field, models.ForeignKey) - ] - for field_name in ["run_id", "created_at", "created_by_id", "updated_at"]: - if field_name in field_names: - field_names.remove(field_name) - field_names.append(field_name) - if field_names[0] != "uid" and "uid" in field_names: - field_names.remove("uid") - field_names.insert(0, "uid") - # create the dataframe - df = pd.DataFrame(self.values(), columns=field_names) - # if len(df) > 0 and "updated_at" in df: - # df.updated_at = format_and_convert_to_local_time(df.updated_at) - # if len(df) > 0 and "started_at" in df: - # df.started_at = format_and_convert_to_local_time(df.started_at) + field_names = get_basic_field_names(self) + annotate_kwargs = {} + if features: + annotate_kwargs.update(get_feature_annotate_kwargs(features)) + if include: + include_kwargs = {s: F(s) for s in include} + annotate_kwargs.update(include_kwargs) + if annotate_kwargs: + queryset = self.annotate(**annotate_kwargs).distinct() + else: + queryset = self + df = pd.DataFrame(queryset.values(*field_names, *list(annotate_kwargs.keys()))) + extra_cols = analyze_lookup_cardinality(self.model.__class__, include) + df_reshaped = reshape_annotate_result(field_names, df, extra_cols, features) pk_name = self.model._meta.pk.name pk_column_name = pk_name if pk_name in df.columns else f"{pk_name}_id" - if pk_column_name in df.columns: - df = df.set_index(pk_column_name) - if len(df) == 0: - logger.warning(colors.yellow("No records found")) - return df - if include is not None: - if isinstance(include, str): - include = [include] - # fix ordering - include = include[::-1] - for expression in include: - split = expression.split("__") - field_name = split[0] - if len(split) > 1: - lookup_str = "__".join(split[1:]) - else: - lookup_str = "id" - Record = self.model - field = getattr(Record, field_name) - if isinstance(field.field, models.ManyToManyField): - related_ORM = ( - field.field.model - if field.field.model != Record - else field.field.related_model - ) - if Record == related_ORM: - left_side_link_model = f"from_{Record.__name__.lower()}" - values_expression = ( - f"to_{Record.__name__.lower()}__{lookup_str}" - ) - else: - left_side_link_model = f"{Record.__name__.lower()}" - values_expression = ( - f"{related_ORM.__name__.lower()}__{lookup_str}" - ) - link_df = pd.DataFrame( - field.through.objects.using(self.db).values( - left_side_link_model, values_expression - ) - ) - if link_df.shape[0] == 0: - logger.warning( - f"{colors.yellow(expression)} is not shown because no values are found" - ) - continue - link_groupby = link_df.groupby(left_side_link_model)[ - values_expression - ].apply(list) - df = pd.concat((link_groupby, df), axis=1, join=join) - 
df.rename(columns={values_expression: expression}, inplace=True) - else: - # the F() based implementation could also work for many-to-many, - # would need to test what is faster - df_anno = pd.DataFrame( - self.annotate(expression=F(expression)).values( - pk_column_name, "expression" - ) - ) - df_anno = df_anno.set_index(pk_column_name) - df_anno.rename(columns={"expression": expression}, inplace=True) - df = pd.concat((df_anno, df), axis=1, join=join) - return df + if pk_column_name in df_reshaped.columns: + df_reshaped = df_reshaped.set_index(pk_column_name) + return df_reshaped def delete(self, *args, **kwargs): """Delete all records in the query set.""" @@ -348,8 +525,8 @@ def one_or_none(self) -> Record | None: """At most one result. Returns it if there is one, otherwise returns ``None``. Examples: - >>> ln.ULabel.filter(name="benchmark").one_or_none() - >>> ln.ULabel.filter(name="non existing label").one_or_none() + >>> ULabel.filter(name="benchmark").one_or_none() + >>> ULabel.filter(name="non existing label").one_or_none() """ if len(self) == 0: return None @@ -388,7 +565,7 @@ def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple: @doc_args(CanCurate.validate.__doc__) -def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs): +def validate(self, values: listLike, field: str | StrField | None = None, **kwargs): """{}""" # noqa: D415 from ._can_curate import _validate @@ -396,7 +573,7 @@ def validate(self, values: ListLike, field: str | StrField | None = None, **kwar @doc_args(CanCurate.inspect.__doc__) -def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs): +def inspect(self, values: listLike, field: str | StrField | None = None, **kwargs): """{}""" # noqa: D415 from ._can_curate import _inspect diff --git a/lamindb/core/_context.py b/lamindb/core/_context.py index 465566b49..892acf533 100644 --- a/lamindb/core/_context.py +++ b/lamindb/core/_context.py @@ -433,7 +433,7 @@ def _track_notebook( nb = nbproject.dev.read_notebook(path_str) self._logging_message_imports += ( "notebook imports:" - f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}\n" + f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}" ) except Exception: logger.debug("inferring imported packages failed") diff --git a/lamindb/core/datasets/__init__.py b/lamindb/core/datasets/__init__.py index 3f6c61bc3..1eda58821 100644 --- a/lamindb/core/datasets/__init__.py +++ b/lamindb/core/datasets/__init__.py @@ -6,6 +6,7 @@ :toctree: . small_dataset1 + small_dataset2 anndata_with_obs Files. @@ -84,7 +85,4 @@ schmidt22_perturbseq, ) from ._fake import fake_bio_notebook_titles -from ._small import ( - anndata_with_obs, - small_dataset1, -) +from ._small import anndata_with_obs, small_dataset1, small_dataset2 diff --git a/lamindb/core/datasets/_small.py b/lamindb/core/datasets/_small.py index 6b490aa27..62a2e25f5 100644 --- a/lamindb/core/datasets/_small.py +++ b/lamindb/core/datasets/_small.py @@ -9,14 +9,16 @@ def small_dataset1( format: Literal["df", "anndata"], + with_typo: bool = False, ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData: # define the data in the dataset # it's a mix of numerical measurements and observation-level metadata + ifng = "IFNJ" if with_typo else "IFNG" dataset_dict = { "CD8A": [1, 2, 3], "CD4": [3, 4, 5], "CD14": [5, 6, 7], - "cell_medium": ["DMSO", "IFNG", "DMSO"], + "cell_medium": ["DMSO", ifng, "DMSO"], "sample_note": ["was ok", "looks naah", "pretty! 
🤩"], "cell_type_by_expert": ["B cell", "T cell", "T cell"], "cell_type_by_model": ["B cell", "T cell", "T cell"], @@ -30,12 +32,44 @@ def small_dataset1( } # the dataset as DataFrame dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"]) - dataset_ad = ad.AnnData( - dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata + if format == "df": + return dataset_df, metadata + else: + dataset_ad = ad.AnnData( + dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata + ) + return dataset_ad + + +def small_dataset2( + format: Literal["df", "anndata"], +) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData: + dataset_dict = { + "CD8A": [2, 3, 3], + "CD4": [3, 4, 5], + "CD38": [4, 2, 3], + "cell_medium": ["DMSO", "IFNG", "IFNG"], + "cell_type_by_model": ["B cell", "T cell", "T cell"], + } + metadata = { + "temperature": 22.6, + "study": "Candidate marker study 2", + "date_of_study": "2025-02-13", + } + dataset_df = pd.DataFrame( + dataset_dict, + index=["sample4", "sample5", "sample6"], + ) + ad.AnnData( + dataset_df[["CD8A", "CD4", "CD38"]], + obs=dataset_df[["cell_medium", "cell_type_by_model"]], ) if format == "df": return dataset_df, metadata else: + dataset_ad = ad.AnnData( + dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata + ) return dataset_ad From c7ace67a0e4a2dd35afedf9744e7481048067577 Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 21:00:07 +0100 Subject: [PATCH 02/10] =?UTF-8?q?=E2=9C=A8=20Add=20a=20new=20section=20to?= =?UTF-8?q?=20the=20registries=20guide?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/registries.ipynb | 99 +++++++++++++++++++++++++++++------- lamindb/_feature.py | 2 + lamindb/_query_set.py | 37 ++++++++++---- lamindb/_record.py | 3 +- lamindb/_view.py | 114 ++++++++++++++++++++++++++++++++++++++---- sub/lnschema-core | 2 +- 6 files changed, 219 insertions(+), 38 deletions(-) diff --git a/docs/registries.ipynb b/docs/registries.ipynb index 68f9805a9..33d74b1b5 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -25,27 +25,12 @@ "This guide walks through different ways of querying & searching LaminDB registries." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "03242699", - "metadata": { - "tags": [ - "hide-output" - ] - }, - "outputs": [], - "source": [ - "# !pip install 'lamindb[bionty]'\n", - "!lamin init --storage ./test-registries --schema bionty" - ] - }, { "cell_type": "markdown", "id": "86abfd19", "metadata": {}, "source": [ - "The hidden cell below creates exemplary datasets and saves them in a LaminDB instance." + "Let's start by creating a few exemplary datasets and saving them into a LaminDB instance (hidden cell)." 
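+    "\n",
+    "The curation step in that cell follows this pattern, sketched here for one dataset (it assumes a `bionty`-enabled instance and the features registered above):\n",
+    "```python\n",
+    "curator = ln.Curator.from_anndata(\n",
+    "    adata,\n",
+    "    var_index=bt.Gene.symbol,\n",
+    "    categoricals={\"cell_medium\": ln.ULabel.name},\n",
+    "    organism=\"human\",\n",
+    ")\n",
+    "artifact = curator.save_artifact(key=\"example_datasets/dataset1.h5ad\")\n",
+    "```"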
] }, { @@ -59,6 +44,10 @@ }, "outputs": [], "source": [ + "# !pip install 'lamindb[bionty]'\n", + "!lamin init --storage ./test-registries --schema bionty\n", + "\n", + "# python\n", "import lamindb as ln\n", "import bionty as bt\n", "from lamindb.core import datasets\n", @@ -108,12 +97,86 @@ "adata2 = datasets.small_dataset2(format=\"anndata\")\n", "curator = ln.Curator.from_anndata(adata2, var_index=bt.Gene.symbol, categoricals={\"cell_medium\": ln.ULabel.name, \"cell_type_by_model\": bt.CellType.name}, organism=\"human\")\n", "artifact2 = curator.save_artifact(key=\"example_datasets/dataset2.h5ad\")\n", - "artifact2.features.add_values(adata2.uns)\n", + "artifact2.features.add_values(adata2.uns)" + ] + }, + { + "cell_type": "markdown", + "id": "36b8d44a", + "metadata": {}, + "source": [ + "## Get an overview" + ] + }, + { + "cell_type": "markdown", + "id": "c6410d93", + "metadata": {}, + "source": [ + "The easiest way to get an overview over all datasets is by typing {meth}`~lamindb.Artifact.df`, which returns the 100 latest artifacts in the {class}`~lamindb.Artifact` registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4413cc02", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "import lamindb as ln\n", "\n", - "# see the content of the artifact registry\n", "ln.Artifact.df()" ] }, + { + "cell_type": "markdown", + "id": "682d8295", + "metadata": {}, + "source": [ + "To join values from other registries into this overview, pass them via the `include` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "462284cc", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "ln.Artifact.df(include=[\"created_by__name\", \"ulabels__name\", \"cell_types__name\", \"feature_sets__registry\"])" + ] + }, + { + "cell_type": "markdown", + "id": "aa3954d7", + "metadata": {}, + "source": [ + "If you'd like to see which artifacts measure which features, use parameter `features`." 
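+    "\n",
+    "`features` also accepts a list of feature names if you only care about a few, e.g. (a sketch, using features from the setup cell):\n",
+    "```python\n",
+    "ln.Artifact.df(features=[\"study\", \"temperature\"])\n",
+    "```"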
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "061047fb", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "df = ln.Artifact.df(features=True)\n", + "ln.view(df) # for clarity, we visualize this with type annotations" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/lamindb/_feature.py b/lamindb/_feature.py index 88adc64c9..353a9774f 100644 --- a/lamindb/_feature.py +++ b/lamindb/_feature.py @@ -57,6 +57,8 @@ def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str: else: # strip precision qualifiers dtype = "".join(dt for dt in pandas_dtype.name if not dt.isdigit()) + if dtype.startswith("datetime"): + dtype = dtype.split("[")[0] assert dtype in FEATURE_DTYPES # noqa: S101 return dtype diff --git a/lamindb/_query_set.py b/lamindb/_query_set.py index 45dc386f1..2bfd57fe5 100644 --- a/lamindb/_query_set.py +++ b/lamindb/_query_set.py @@ -39,6 +39,9 @@ class MultipleResultsFound(Exception): pass +pd.set_option("display.max_columns", 200) + + # def format_and_convert_to_local_time(series: pd.Series): # tzinfo = datetime.now().astimezone().tzinfo # timedelta = tzinfo.utcoffset(datetime.now()) # type: ignore @@ -201,7 +204,14 @@ def get_basic_field_names(qs: QuerySet) -> list[str]: for field in qs.model._meta.fields if isinstance(field, models.ForeignKey) ] - for field_name in ["run_id", "created_at", "created_by_id", "updated_at"]: + for field_name in [ + "version", + "is_latest", + "run_id", + "created_at", + "created_by_id", + "updated_at", + ]: if field_name in field_names: field_names.remove(field_name) field_names.append(field_name) @@ -259,7 +269,7 @@ def get_feature_annotate_kwargs(show_features: bool | list[str]) -> dict[str, An # https://claude.ai/share/16280046-6ae5-4f6a-99ac-dec01813dc3c def analyze_lookup_cardinality( - model_class: Registry, lookup_paths: str | list[str] | None + model_class: Record, lookup_paths: list[str] | None ) -> dict[str, str]: """Analyze lookup cardinality. 
@@ -276,8 +286,6 @@ def analyze_lookup_cardinality( result = {} # type: ignore if lookup_paths is None: return result - elif isinstance(lookup_paths, str): - lookup_paths = [lookup_paths] for lookup_path in lookup_paths: parts = lookup_path.split("__") current_model = model_class @@ -343,7 +351,10 @@ def reshape_annotate_result( if all(col in df.columns for col in feature_cols): feature_values = process_feature_values(df, features) if not feature_values.empty: - result = result.merge(feature_values, on="id", how="left") + for col in feature_values.columns: + if col in result.columns: + continue + result.insert(3, col, feature_values[col]) # Handle links features if they exist links_features = [ @@ -388,6 +399,7 @@ def process_links_features( features: bool | list[str], ) -> pd.DataFrame: """Process links_XXX feature columns.""" + # this loops over different entities that might be linked under a feature for feature_col in feature_cols: prefix = re.match(r"links_(.+?)__feature__name", feature_col).group(1) @@ -404,6 +416,7 @@ def process_links_features( value_col = value_cols[0] feature_names = df[feature_col].unique() + feature_names = feature_names[~pd.isna(feature_names)] # Filter features if specific ones requested if isinstance(features, list): @@ -412,7 +425,7 @@ def process_links_features( for feature_name in feature_names: mask = df[feature_col] == feature_name feature_values = df[mask].groupby("id")[value_col].agg(set) - result[feature_name] = result["id"].map(feature_values) + result.insert(3, feature_name, result["id"].map(feature_values)) return result @@ -426,7 +439,7 @@ def process_extra_columns( continue values = df.groupby("id")[col].agg(set if col_type == "many" else "first") - result[col] = result["id"].map(values) + result.insert(0, col, result["id"].map(values)) return result @@ -458,14 +471,20 @@ def df( if features: annotate_kwargs.update(get_feature_annotate_kwargs(features)) if include: + if isinstance(include, str): + include = [include] + include = include.copy()[::-1] include_kwargs = {s: F(s) for s in include} annotate_kwargs.update(include_kwargs) if annotate_kwargs: - queryset = self.annotate(**annotate_kwargs).distinct() + queryset = self.annotate(**annotate_kwargs) else: queryset = self df = pd.DataFrame(queryset.values(*field_names, *list(annotate_kwargs.keys()))) - extra_cols = analyze_lookup_cardinality(self.model.__class__, include) + if len(df) == 0: + df = pd.DataFrame({}, columns=field_names) + return df + extra_cols = analyze_lookup_cardinality(self.model, include) # type: ignore df_reshaped = reshape_annotate_result(field_names, df, extra_cols, features) pk_name = self.model._meta.pk.name pk_column_name = pk_name if pk_name in df.columns else f"{pk_name}_id" diff --git a/lamindb/_record.py b/lamindb/_record.py index ec5c219c9..2331966fc 100644 --- a/lamindb/_record.py +++ b/lamindb/_record.py @@ -264,6 +264,7 @@ def get( def df( cls, include: str | list[str] | None = None, + features: bool | list[str] = False, join: str = "inner", limit: int = 100, ) -> pd.DataFrame: @@ -271,7 +272,7 @@ def df( query_set = cls.filter() if hasattr(cls, "updated_at"): query_set = query_set.order_by("-updated_at") - return query_set[:limit].df(include=include, join=join) + return query_set[:limit].df(include=include, features=features, join=join) def _search( diff --git a/lamindb/_view.py b/lamindb/_view.py index 6a781cd95..fa82bf9e0 100644 --- a/lamindb/_view.py +++ b/lamindb/_view.py @@ -3,22 +3,107 @@ import builtins import importlib import inspect +from typing import 
TYPE_CHECKING

+from IPython.display import HTML, display
 from lamin_utils import colors, logger
 from lamindb_setup import settings
 from lamindb_setup._init_instance import get_schema_module_name
-from lnschema_core import Record
+from lnschema_core import Feature, Record
+
+from lamindb.core import FeatureValue, ParamValue
+
+from ._feature import convert_pandas_dtype_to_lamin_dtype
+
+if TYPE_CHECKING:
+    import pandas as pd

 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)


+def display_df_with_descriptions(
+    df: pd.DataFrame, descriptions: dict[str, str] | None = None
+):
+    if descriptions is None:
+        display(df)
+        return None
+
+    # Start building HTML table
+    html = '<table class="dataframe">'
+
+    # Create header with title and description rows
+    html += "<thead>"
+
+    # Column names row
+    html += "<tr>"
+    html += "<th></th>"  # Index header
+    for col in df.columns:
+        html += f"<th>{col}</th>"
+    html += "</tr>"
+
+    # Descriptions row
+    html += "<tr>"
+    html += f"<th>{df.index.name or ''}</th>"  # Index column
+    for col in df.columns:
+        desc = descriptions.get(col, "")
+        html += f"<th>{desc}</th>"
+    html += "</tr>"
+
+    html += "</thead>"
+
+    # Add body rows
+    html += "<tbody>"
+    for idx, row in df.iterrows():
+        html += "<tr>"
+        html += f"<td>{idx}</td>"  # Index value
+        for col in df.columns:
+            html += f"<td>{row[col]}</td>"
+        html += "</tr>"
+    html += "</tbody>"
+    html += "</table>
" + + # Add CSS styles + styled_html = f""" + + {html} + """ + return display(HTML(styled_html)) + + def view( - n: int = 7, schema: str | None = None, registries: list[str] | None = None + df: pd.DataFrame | None = None, + limit: int = 7, + schema: str | None = None, + registries: list[str] | None = None, ) -> None: - """View latest metadata state. + """View metadata. Args: - n: Display the last `n` rows of a registry. + df: A DataFrame to display. + limit: Display the latest `n` records schema: Schema module to view. Default's to `None` and displays all schema modules. registries: List of Record names. Defaults to @@ -27,6 +112,16 @@ def view( Examples: >>> ln.view() """ + if df is not None: + descriptions = { + col_name: convert_pandas_dtype_to_lamin_dtype(dtype) + for col_name, dtype in df.dtypes.to_dict().items() + } + feature_dtypes = dict(Feature.objects.values_list("name", "dtype")) + descriptions.update(feature_dtypes) + display_df_with_descriptions(df, descriptions) + return None + if is_run_from_ipython: from IPython.display import display as show else: @@ -39,6 +134,9 @@ def view( for schema_name in schema_names: schema_module = importlib.import_module(get_schema_module_name(schema_name)) + # the below is necessary because a schema module might not have been + # explicitly accessed + importlib.reload(schema_module) all_registries = { registry @@ -47,6 +145,8 @@ def view( and issubclass(registry, Record) and registry is not Record } + if schema_name == "core": + all_registries.update({FeatureValue, ParamValue}) if registries is not None: filtered_registries = { registry @@ -62,11 +162,7 @@ def view( logger.print(section) logger.print("*" * len(section_no_color)) for registry in sorted(filtered_registries, key=lambda x: x.__name__): - if hasattr(registry, "updated_at"): - df = registry.filter().order_by("-updated_at")[:n].df() - else: - # need to adjust in the future - df = registry.df().iloc[-n:] + df = registry.df(limit=limit) if df.shape[0] > 0: logger.print(colors.blue(colors.bold(registry.__name__))) show(df) diff --git a/sub/lnschema-core b/sub/lnschema-core index 04b4421aa..2eb23a663 160000 --- a/sub/lnschema-core +++ b/sub/lnschema-core @@ -1 +1 @@ -Subproject commit 04b4421aa2ddfe43a79381123463236569aee308 +Subproject commit 2eb23a6639098a1dd660eabecdf0b207194be949 From 41f8f82d3105696d3f80b9a5adbd45b470690b3a Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 21:21:14 +0100 Subject: [PATCH 03/10] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Prettify=20guide?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/registries.ipynb | 77 +++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/docs/registries.ipynb b/docs/registries.ipynb index 33d74b1b5..dcdd95831 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -208,49 +208,36 @@ }, "outputs": [], "source": [ - "# query the database for all users, optionally pass the field that creates the key\n", - "users = ln.User.lookup(field=\"handle\")\n", - "\n", - "# the lookup object is a NamedTuple\n", - "users" + "# query the database for all ulabels or all cell types\n", + "ulabels = ln.ULabel.lookup()\n", + "cell_types = bt.CellType.lookup()" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "82a31938", + "id": "24926aa1", "metadata": {}, "source": [ - "With auto-complete, we find a specific user record:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31853b49", - "metadata": { - 
"tags": [ - "hide-output" - ] - }, - "outputs": [], - "source": [ - "user = users.testuser1\n", - "user" + ":::{dropdown} Show me a screenshot\n", + "\n", + "\n", + "\n", + ":::" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "bf7e1415", + "id": "82a31938", "metadata": {}, "source": [ - "You can also get a dictionary:" + "With auto-complete, we find a ulabel:" ] }, { "cell_type": "code", "execution_count": null, - "id": "04130d47", + "id": "31853b49", "metadata": { "tags": [ "hide-output" @@ -258,8 +245,8 @@ }, "outputs": [], "source": [ - "users_dict = ln.User.lookup().dict()\n", - "users_dict" + "study1 = ulabels.candidate_marker_study_1\n", + "study1" ] }, { @@ -289,11 +276,13 @@ }, "outputs": [], "source": [ - "# by the universal base62 uid\n", - "ln.User.get(\"DzTjkKse\")\n", + "print(study.uid)\n", + "\n", + "# by uid\n", + "ln.ULabel.get(study1.uid)\n", "\n", - "# by any expression involving fields\n", - "ln.User.get(handle=\"testuser1\")" + "# by field\n", + "ln.ULabel.get(name=\"Canidate marker study 1\")" ] }, { @@ -311,7 +300,7 @@ "id": "02b75183", "metadata": {}, "source": [ - "Filter for all artifacts created by a user:" + "Filter for all artifacts annotated by a ulabel:" ] }, { @@ -325,7 +314,7 @@ }, "outputs": [], "source": [ - "ln.Artifact.filter(created_by=user).df()" + "ln.Artifact.filter(ulabels=user).df()" ] }, { @@ -336,10 +325,10 @@ "source": [ "To access the results encoded in a filter statement, execute its return value with one of:\n", "\n", - "- `.df()`: A pandas `DataFrame` with each record in a row.\n", - "- `.all()`: A {class}`~lamindb.core.QuerySet`.\n", - "- `.one()`: Exactly one record. Will raise an error if there is none. Is equivalent to the `.get()` method shown above.\n", - "- `.one_or_none()`: Either one record or `None` if there is no query result." + "- {meth}`~lamindb.core.QuerySet.df`: A pandas `DataFrame` with each record in a row.\n", + "- {meth}`~lamindb.core.QuerySet.all`: A {class}`~lamindb.core.QuerySet`.\n", + "- {meth}`~lamindb.core.QuerySet.one`: Exactly one record. Will raise an error if there is none. Is equivalent to the `.get()` method shown above.\n", + "- {meth}`~lamindb.core.QuerySet.one_or_one`: Either one record or `None` if there is no query result." ] }, { @@ -351,11 +340,19 @@ "\n", "{meth}`~lamindb.core.Record.filter` returns a {class}`~lamindb.core.QuerySet`.\n", "\n", - "The ORMs in LaminDB are Django Models and any [Django query](https://docs.djangoproject.com/en/stable/topics/db/queries/) works. LaminDB extends Django's API for data scientists.\n", + "The registries in LaminDB are Django Models and any [Django query](https://docs.djangoproject.com/en/stable/topics/db/queries/) works.\n", + "\n", + "LaminDB re-interprets Django's API for data scientists.\n", + "\n", + "```\n", + "\n", + "```{dropdown} What does this have to do with SQL?\n", "\n", "Under the hood, any `.filter()` call translates into a SQL select statement.\n", "\n", - "`.one()` and `.one_or_none()` are two parts of LaminDB's API that are borrowed from SQLAlchemy.\n", + "LaminDB's registries are object relational mappers (ORMs) that rely on Django for all the heavy lifting.\n", + "\n", + "Of note, `.one()` and `.one_or_none()` are the two parts of LaminDB's API that are borrowed from SQLAlchemy. 
In its first year, LaminDB built on SQLAlchemy.\n", "\n", "```" ] From 3a4ea303a0179face043934d912ce07b5a2d0fdd Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 22:41:19 +0100 Subject: [PATCH 04/10] =?UTF-8?q?=F0=9F=92=9A=20Improve=20&=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/registries.ipynb | 18 +++++++++++++ lamindb/_query_set.py | 1 - lamindb/_record.py | 3 +-- lamindb/_save.py | 1 + sub/lnschema-core | 2 +- tests/core/test_queryset.py | 53 +++++++++++++------------------------ 6 files changed, 40 insertions(+), 38 deletions(-) diff --git a/docs/registries.ipynb b/docs/registries.ipynb index dcdd95831..bcc670734 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -177,6 +177,24 @@ "ln.view(df) # for clarity, we visualize this with type annotations" ] }, + { + "cell_type": "markdown", + "id": "82f563e0", + "metadata": {}, + "source": [ + "Compare this with the underlying normalized view of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9c04f85", + "metadata": {}, + "outputs": [], + "source": [ + "ln.view()" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/lamindb/_query_set.py b/lamindb/_query_set.py index 2bfd57fe5..0ec86c499 100644 --- a/lamindb/_query_set.py +++ b/lamindb/_query_set.py @@ -463,7 +463,6 @@ def df( self, include: str | list[str] | None = None, features: bool | list[str] = False, - join: str = "inner", ) -> pd.DataFrame: """{}""" # noqa: D415 field_names = get_basic_field_names(self) diff --git a/lamindb/_record.py b/lamindb/_record.py index 2331966fc..76307f064 100644 --- a/lamindb/_record.py +++ b/lamindb/_record.py @@ -265,14 +265,13 @@ def df( cls, include: str | list[str] | None = None, features: bool | list[str] = False, - join: str = "inner", limit: int = 100, ) -> pd.DataFrame: """{}""" # noqa: D415 query_set = cls.filter() if hasattr(cls, "updated_at"): query_set = query_set.order_by("-updated_at") - return query_set[:limit].df(include=include, features=features, join=join) + return query_set[:limit].df(include=include, features=features) def _search( diff --git a/lamindb/_save.py b/lamindb/_save.py index 95ddb1a12..7ac2d3da9 100644 --- a/lamindb/_save.py +++ b/lamindb/_save.py @@ -112,6 +112,7 @@ def bulk_create(records: Iterable[Record], ignore_conflicts: bool | None = False records_by_orm[record.__class__].append(record) for registry, records in records_by_orm.items(): registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts) + # records[:] = created # In-place list update; does not seem to be necessary def bulk_update(records: Iterable[Record], ignore_conflicts: bool | None = False): diff --git a/sub/lnschema-core b/sub/lnschema-core index 2eb23a663..47ea16022 160000 --- a/sub/lnschema-core +++ b/sub/lnschema-core @@ -1 +1 @@ -Subproject commit 2eb23a6639098a1dd660eabecdf0b207194be949 +Subproject commit 47ea160227f223471cfdd7e66856bb3fb0514e35 diff --git a/tests/core/test_queryset.py b/tests/core/test_queryset.py index 0dbc4e1a5..d34334863 100644 --- a/tests/core/test_queryset.py +++ b/tests/core/test_queryset.py @@ -8,30 +8,23 @@ def test_df(): - # for self-referential models - project_label = ln.ULabel(name="Project") - project_label.save() + project_label = ln.ULabel(name="project").save() project_names = [f"Project {i}" for i in range(3)] - labels = [ln.ULabel(name=name) for name in project_names] - ln.save(labels) - for label in labels: - label.parents.add(project_label) - df = 
ln.ULabel.filter().df(include="parents__name") + labels = ln.ULabel.from_values(project_names, create=True).save() + project_label.children.add(*labels) + df = ln.ULabel.df(include="parents__name") assert df.columns[0] == "parents__name" - # order is not conserved - assert df["parents__name"].iloc[0] == [project_label.name] - # pass a list - df = ln.ULabel.filter().df(include=["parents__name", "parents__created_by_id"]) + assert df["parents__name"].iloc[0] == {project_label.name} + df = ln.ULabel.df(include=["parents__name", "parents__created_by_id"]) assert df.columns[1] == "parents__created_by_id" - assert df["parents__name"].iloc[0] == [project_label.name] + assert df["parents__name"].iloc[0] == {project_label.name} assert set(df["parents__created_by_id"].iloc[0]) == {current_user_id()} # for other models feature_names = [f"Feature {i}" for i in range(3)] features = [ln.Feature(name=name, dtype=int) for name in feature_names] ln.save(features) - feature_set = ln.FeatureSet(features, name="my feature_set") - feature_set.save() + feature_set = ln.FeatureSet(features, name="my feature_set").save() feature_set.features.set(features) df = ln.FeatureSet.filter(name="my feature_set").df(include="features__name") @@ -48,18 +41,8 @@ def test_df(): # inner join parents on features df = ln.FeatureSet.filter().df( - include=["features__name", "features__created_by_id"], join="inner" - ) - print(df) - assert set(df["features__name"].iloc[0]) == set(feature_names) - assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()} - - # outer join parents on features (this test should be expanded to make it - # actually relevant) - df = ln.FeatureSet.filter().df( - include=["features__name", "features__created_by_id"], join="outer" + include=["features__name", "features__created_by_id"] ) - print(df) assert set(df["features__name"].iloc[0]) == set(feature_names) assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()} @@ -68,14 +51,16 @@ def test_df(): assert df["created_by__name"].iloc[0] == "Test User1" # do not return fields with no data in the registry - df = ( - ln.Artifact.using("laminlabs/cellxgene") - .filter(suffix=".h5ad") - .df(include=["tissues__name", "pathways__name"]) - ) - assert "tissues__name" in df.columns - assert "pathways__name" not in df.columns - assert df.shape[0] > 0 + # does not make sense in Alex's opinion + # too much magic; got removed in https://github.com/laminlabs/lamindb/pull/2238 + # df = ( + # ln.Artifact.using("laminlabs/cellxgene") + # .filter(suffix=".h5ad") + # .df(include=["tissues__name", "pathways__name"]) + # ) + # assert "tissues__name" in df.columns + # assert "pathways__name" not in df.columns + # assert df.shape[0] > 0 # clean up project_label.delete() From 5d1fc8470a9c6827e1bca85e7ea5de5a0cff87a4 Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 22:44:34 +0100 Subject: [PATCH 05/10] =?UTF-8?q?=F0=9F=93=9D=20Polish?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/registries.ipynb | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/registries.ipynb b/docs/registries.ipynb index bcc670734..6d00ecfa1 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -137,7 +137,7 @@ "id": "682d8295", "metadata": {}, "source": [ - "To join values from other registries into this overview, pass them via the `include` parameter." + "To join fields from other registries into this overview, pass them via the `include` argument." 
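+    "\n",
+    "`include` takes Django-style lookup paths; a single string works too (sketch):\n",
+    "```python\n",
+    "ln.Artifact.df(include=\"created_by__name\")\n",
+    "```"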
] }, { @@ -159,7 +159,7 @@ "id": "aa3954d7", "metadata": {}, "source": [ - "If you'd like to see which artifacts measure which features, use parameter `features`." + "If you'd like to see which artifacts measure which features, use the `features` argument." ] }, { @@ -210,9 +210,7 @@ "id": "5cb551b4", "metadata": {}, "source": [ - "For registries with less than 100k records, auto-completing a `Lookup` object is the most convenient way of finding a record.\n", - "\n", - "For example, take the `User` registry:" + "For registries with less than 100k records, auto-completing a `Lookup` object is the most convenient way of finding a record." ] }, { From 6494cbd2af8ec92cc368ac87ca4bee5e7d3fe5f9 Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 23:08:24 +0100 Subject: [PATCH 06/10] =?UTF-8?q?=F0=9F=92=9A=20Fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/registries.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/registries.ipynb b/docs/registries.ipynb index 6d00ecfa1..897c04b41 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -292,7 +292,7 @@ }, "outputs": [], "source": [ - "print(study.uid)\n", + "print(study1.uid)\n", "\n", "# by uid\n", "ln.ULabel.get(study1.uid)\n", From 3eda92f7176c6ab0bd532deeb151e7a3a65fe727 Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Fri, 29 Nov 2024 23:13:49 +0100 Subject: [PATCH 07/10] =?UTF-8?q?=F0=9F=92=9A=20Fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/registries.ipynb | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/registries.ipynb b/docs/registries.ipynb index 897c04b41..2478cc24c 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -298,7 +298,7 @@ "ln.ULabel.get(study1.uid)\n", "\n", "# by field\n", - "ln.ULabel.get(name=\"Canidate marker study 1\")" + "ln.ULabel.get(name=\"Candidate marker study 1\")" ] }, { @@ -330,7 +330,7 @@ }, "outputs": [], "source": [ - "ln.Artifact.filter(ulabels=user).df()" + "ln.Artifact.filter(ulabels=study1).df()" ] }, { @@ -528,7 +528,7 @@ }, "outputs": [], "source": [ - "ln.Artifact.filter(suffix=\".jpg\", created_by=user).df()" + "ln.Artifact.filter(suffix=\".h5ad\", ulabels=study1).df()" ] }, { @@ -546,7 +546,7 @@ "id": "fdd61cf1-d3c7-4bfb-a0b2-14e81201db03", "metadata": {}, "source": [ - "Or subset to artifacts smaller than 10kB. Here, we can't use keyword arguments, but need an explicit where statement." + "Or subset to artifacts greater than 10kB. Here, we can't use keyword arguments, but need an explicit where statement." ] }, { @@ -560,7 +560,7 @@ }, "outputs": [], "source": [ - "ln.Artifact.filter(created_by=user, size__lt=1e4).df()" + "ln.Artifact.filter(ulabels=study1, size__gt=1e4).df()" ] }, { @@ -606,7 +606,7 @@ }, "outputs": [], "source": [ - "ln.Artifact.filter().order_by(\"-updated_at\").df()" + "ln.Artifact.filter().order_by(\"-created_at\").df()" ] }, { @@ -721,21 +721,13 @@ "ln.Artifact.filter(~ln.Q(suffix=\".jpg\")).df()" ] }, - { - "cell_type": "markdown", - "id": "ffb9c7a9", - "metadata": {}, - "source": [ - "Clean up the test instance." 
- ] - }, { "cell_type": "code", "execution_count": null, "id": "72b03f8b", "metadata": { "tags": [ - "hide-output" + "hide-cell" ] }, "outputs": [], From 67b28e59a45ae3500ddea742b7a708d24eb6821f Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Sat, 30 Nov 2024 00:02:58 +0100 Subject: [PATCH 08/10] =?UTF-8?q?=F0=9F=92=9A=20Fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/faq/search.ipynb | 89 ++++++++++++----------------- docs/registries.ipynb | 127 +++++++++++++++++++++--------------------- lamindb/_query_set.py | 10 ++-- 3 files changed, 103 insertions(+), 123 deletions(-) diff --git a/docs/faq/search.ipynb b/docs/faq/search.ipynb index 30dd2e157..c048624f3 100644 --- a/docs/faq/search.ipynb +++ b/docs/faq/search.ipynb @@ -8,33 +8,19 @@ "# How does search work?" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "955b6253", - "metadata": {}, - "outputs": [], - "source": [ - "from laminci.db import setup_local_test_postgres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f312433", - "metadata": {}, - "outputs": [], - "source": [ - "pgurl = setup_local_test_postgres()" - ] - }, { "cell_type": "code", "execution_count": null, "id": "286aeebc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-cell" + ] + }, "outputs": [], "source": [ + "from laminci.db import setup_local_test_postgres\n", + "pgurl = setup_local_test_postgres()\n", "!lamin init --name benchmark_search --db {pgurl} --schema bionty --storage ./benchmark_search" ] }, @@ -43,7 +29,7 @@ "id": "40234d47", "metadata": {}, "source": [ - "Here we show how to perform text search on `Record` and evaluate some search queries." + "Here we show how to perform text search on `Record` and evaluate some search queries for the {class}`bionty.CellType` ontology." 
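+    "\n",
+    "Every registry exposes the same entry point, so each benchmark below is a variation of (sketch):\n",
+    "```python\n",
+    "bt.CellType.search(\"t cell\").df()\n",
+    "```"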
] }, { @@ -54,36 +40,32 @@ "outputs": [], "source": [ "import lamindb as ln\n", - "import bionty as bt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83a46a9a", - "metadata": {}, - "outputs": [], - "source": [ + "import bionty as bt\n", + "\n", + "SEARCH_QUERIES_EXACT = (\"t cell\", \"stem cell\", \"b cell\", \"regulatory B cell\", \"Be2 cell\", \"adipocyte\")\n", + "SEARCH_QUERIES_CONTAINS = (\"t cel\", \"t-cel\", \"neural\", \"kidney\", \"kidne\")\n", + "TOP_N = 20\n", + "\n", "bt.CellType.import_source()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "7991a73f", + "cell_type": "markdown", + "id": "799b528e", "metadata": {}, - "outputs": [], "source": [ - "SEARCH_QUERIES_EXACT = (\"t cell\", \"stem cell\", \"b cell\", \"regulatory B cell\", \"Be2 cell\", \"adipocyte\")\n", - "SEARCH_QUERIES_CONTAINS = (\"t cel\", \"t-cel\", \"neural\", \"kidney\", \"kidne\")\n", - "TOP_N = 20" + "## Search the registry" ] }, { "cell_type": "code", "execution_count": null, "id": "682fcd2f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "for query in SEARCH_QUERIES_EXACT:\n", @@ -98,7 +80,11 @@ "cell_type": "code", "execution_count": null, "id": "ee059b0d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "for query in SEARCH_QUERIES_CONTAINS:\n", @@ -116,29 +102,24 @@ "id": "f4e41c9b", "metadata": {}, "source": [ - "Also check `bionty` public ontologies search:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb5aa05a", - "metadata": {}, - "outputs": [], - "source": [ - "ct_public = bt.CellType.public()" + "## Search the public ontology" ] }, { "cell_type": "code", "execution_count": null, "id": "d42c3594", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ + "ct_public = bt.CellType.public()\n", + "\n", "df = ct_public.search(\"b cell\", limit=20)\n", "assert df.iloc[0][\"name\"] == \"B cell\"\n", - "\n", "df" ] }, diff --git a/docs/registries.ipynb b/docs/registries.ipynb index 2478cc24c..fecdab3ce 100644 --- a/docs/registries.ipynb +++ b/docs/registries.ipynb @@ -113,7 +113,7 @@ "id": "c6410d93", "metadata": {}, "source": [ - "The easiest way to get an overview over all datasets is by typing {meth}`~lamindb.Artifact.df`, which returns the 100 latest artifacts in the {class}`~lamindb.Artifact` registry." + "The easiest way to get an overview over all artifacts is by typing {meth}`~lamindb.Artifact.df`, which returns the 100 latest artifacts in the {class}`~lamindb.Artifact` registry." ] }, { @@ -137,7 +137,7 @@ "id": "682d8295", "metadata": {}, "source": [ - "To join fields from other registries into this overview, pass them via the `include` argument." + "You can include fields from other registries." ] }, { @@ -159,7 +159,7 @@ "id": "aa3954d7", "metadata": {}, "source": [ - "If you'd like to see which artifacts measure which features, use the `features` argument." + "You can include information about which artifact measures which `feature`." ] }, { @@ -174,7 +174,7 @@ "outputs": [], "source": [ "df = ln.Artifact.df(features=True)\n", - "ln.view(df) # for clarity, we visualize this with type annotations" + "ln.view(df) # for clarity, leverage ln.view() to display a dataframe with dtype annotations" ] }, { @@ -182,14 +182,18 @@ "id": "82f563e0", "metadata": {}, "source": [ - "Compare this with the underlying normalized view of the data." 
+ "The flattened table that includes information from all relevant registries is easier to understand than the normalized data. For comparison, here is how to see the later." ] }, { "cell_type": "code", "execution_count": null, "id": "a9c04f85", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "ln.view()" @@ -201,7 +205,7 @@ "id": "bbda4807", "metadata": {}, "source": [ - "## Look up metadata" + "## Auto-complete records" ] }, { @@ -224,6 +228,8 @@ }, "outputs": [], "source": [ + "import bionty as bt\n", + "\n", "# query the database for all ulabels or all cell types\n", "ulabels = ln.ULabel.lookup()\n", "cell_types = bt.CellType.lookup()" @@ -270,7 +276,7 @@ "id": "d54676dd", "metadata": {}, "source": [ - "## Query exactly one record" + "## Get one record" ] }, { @@ -307,7 +313,7 @@ "id": "45ac3b5c", "metadata": {}, "source": [ - "## Query sets of records" + "## Query multiple records" ] }, { @@ -344,7 +350,7 @@ "- {meth}`~lamindb.core.QuerySet.df`: A pandas `DataFrame` with each record in a row.\n", "- {meth}`~lamindb.core.QuerySet.all`: A {class}`~lamindb.core.QuerySet`.\n", "- {meth}`~lamindb.core.QuerySet.one`: Exactly one record. Will raise an error if there is none. Is equivalent to the `.get()` method shown above.\n", - "- {meth}`~lamindb.core.QuerySet.one_or_one`: Either one record or `None` if there is no query result." + "- {meth}`~lamindb.core.QuerySet.one_or_none`: Either one record or `None` if there is no query result." ] }, { @@ -387,7 +393,7 @@ "id": "a925a678", "metadata": {}, "source": [ - "Search the toy data:" + "You can search every registry via {meth}`~lamindb.core.Record.search`. For example, the `Artifact` registry." ] }, { @@ -405,42 +411,11 @@ ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "b8ba2bfe", + "id": "ed43c3fa", "metadata": {}, "source": [ - "Let us create 500 notebook objects with fake titles, save, and search them:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b0b0c0c", - "metadata": { - "tags": [ - "hide-output" - ] - }, - "outputs": [], - "source": [ - "transforms = [ln.Transform(name=title, type=\"notebook\") for title in ln.core.datasets.fake_bio_notebook_titles(n=500)]\n", - "ln.save(transforms)\n", - "\n", - "# search\n", - "ln.Transform.search(\"intestine\").df().head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "4ba825ab", - "metadata": {}, - "source": [ - "```{note}\n", - "\n", - "Currently, the LaminHub UI search is more powerful than the search of the `lamindb` open-source package.\n", - "\n", - "```" + "Here is more background on search and examples for searching the entire cell type ontology: {doc}`/faq/search` " ] }, { @@ -449,7 +424,7 @@ "id": "f85478c0", "metadata": {}, "source": [ - "## Leverage relations" + "## Query related registries" ] }, { @@ -483,10 +458,49 @@ "id": "8aa6378a", "metadata": {}, "source": [ - "The filter selects all artifacts based on the users who ran the generating notebook.\n", + "The filter selects all artifacts based on the users who ran the generating notebook. Under the hood, in the SQL database, it's joining the artifact table with the user table.\n", "\n", - "Under the hood, in the SQL database, it's joining the artifact table with the run and the user table.\n", - "\n" + "Another typical example is querying all datasets that measure a particular feature. For instance, which datasets measure `\"CD8A\"`. 
diff --git a/lamindb/_query_set.py b/lamindb/_query_set.py
index 0ec86c499..d6a8f8a91 100644
--- a/lamindb/_query_set.py
+++ b/lamindb/_query_set.py
@@ -32,7 +32,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from lnschema_core.types import StrField, listLike
+    from lnschema_core.types import ListLike, StrField
 
 
 class MultipleResultsFound(Exception):
@@ -354,7 +354,7 @@ def reshape_annotate_result(
     for col in feature_values.columns:
         if col in result.columns:
             continue
-        result.insert(3, col, feature_values[col])
+        result.insert(0, col, feature_values[col])
 
     # Handle links features if they exist
     links_features = [
@@ -425,7 +425,7 @@ def process_links_features(
     for feature_name in feature_names:
         mask = df[feature_col] == feature_name
         feature_values = df[mask].groupby("id")[value_col].agg(set)
-        result.insert(3, feature_name, result["id"].map(feature_values))
+        result.insert(0, feature_name, result["id"].map(feature_values))
 
     return result
 
@@ -583,7 +583,7 @@ def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple:
 
 
 @doc_args(CanCurate.validate.__doc__)
-def validate(self, values: listLike, field: str | StrField | None = None, **kwargs):
+def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs):
     """{}"""  # noqa: D415
     from ._can_curate import _validate
 
@@ -591,7 +591,7 @@
 
 @doc_args(CanCurate.inspect.__doc__)
-def inspect(self, values: listLike, field: str | StrField | None = None, **kwargs):
+def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs):
     """{}"""  # noqa: D415
     from ._can_curate import _inspect
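Patch 08 above moves feature and extra columns to the front of the returned dataframe (`insert(0, ...)`); patch 09 below revisits this and slots them in after the leading basic fields (`insert(4, ...)`). A rough sketch of the call whose output layout is being reworked, assuming an instance with annotated artifacts:

```python
import lamindb as ln

# basic fields come from the registry itself; `include` joins columns from
# related registries; `features=True` adds one column per linked feature
df = ln.Artifact.df(
    include=["created_by__name", "ulabels__name"],
    features=True,
)
print(df.columns.tolist())  # shows where the joined columns land
```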
From 993fe08ea492a10f5bad700d11e6e45f5f42bd2b Mon Sep 17 00:00:00 2001
From: Alex Wolf
Date: Sat, 30 Nov 2024 02:53:03 +0100
Subject: [PATCH 09/10] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Upgrade=20lnschema-c?=
 =?UTF-8?q?ore=20and=20polish=20order?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/registries.ipynb |  4 ++--
 lamindb/_query_set.py | 27 +++++++++++++++++--------
 sub/lnschema-core     |  2 +-
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/docs/registries.ipynb b/docs/registries.ipynb
index fecdab3ce..9f22add91 100644
--- a/docs/registries.ipynb
+++ b/docs/registries.ipynb
@@ -151,7 +151,7 @@
    },
    "outputs": [],
    "source": [
-    "ln.Artifact.df(include=[\"created_by__name\", \"ulabels__name\", \"cell_types__name\", \"feature_sets__registry\"])"
+    "ln.Artifact.df(include=[\"created_by__name\", \"ulabels__name\", \"cell_types__name\", \"feature_sets__registry\", \"suffix\"])"
    ]
   },
   {
@@ -174,7 +174,7 @@
    "outputs": [],
    "source": [
     "df = ln.Artifact.df(features=True)\n",
-    "ln.view(df) # for clarity, leverage ln.view() to display a dataframe with dtype annotations"
+    "ln.view(df) # for clarity, leverage ln.view() to display dtype annotations"
    ]
   },
   {
diff --git a/lamindb/_query_set.py b/lamindb/_query_set.py
index d6a8f8a91..46bdf92ed 100644
--- a/lamindb/_query_set.py
+++ b/lamindb/_query_set.py
@@ -189,7 +189,9 @@ def save(self) -> RecordList[T]:
         return self
 
 
-def get_basic_field_names(qs: QuerySet) -> list[str]:
+def get_basic_field_names(
+    qs: QuerySet, include: list[str], features: bool | list[str] = False
+) -> list[str]:
     exclude_field_names = ["updated_at"]
     field_names = [
         field.name
@@ -218,6 +220,11 @@
     if field_names[0] != "uid" and "uid" in field_names:
         field_names.remove("uid")
         field_names.insert(0, "uid")
+    if include or features:
+        subset_field_names = field_names[:4]
+        intersection = set(field_names) & set(include)
+        subset_field_names += list(intersection)
+        field_names = subset_field_names
     return field_names
 
 
@@ -354,7 +361,7 @@ def reshape_annotate_result(
     for col in feature_values.columns:
         if col in result.columns:
             continue
-        result.insert(0, col, feature_values[col])
+        result.insert(4, col, feature_values[col])
 
     # Handle links features if they exist
     links_features = [
@@ -425,7 +432,7 @@ def process_links_features(
     for feature_name in feature_names:
         mask = df[feature_col] == feature_name
         feature_values = df[mask].groupby("id")[value_col].agg(set)
-        result.insert(0, feature_name, result["id"].map(feature_values))
+        result.insert(4, feature_name, result["id"].map(feature_values))
 
     return result
 
@@ -437,9 +444,11 @@ def process_extra_columns(
     for col, col_type in extra_columns.items():
         if col not in df.columns:
             continue
+        if col in result.columns:
+            continue
 
         values = df.groupby("id")[col].agg(set if col_type == "many" else "first")
-        result.insert(0, col, result["id"].map(values))
+        result.insert(4, col, result["id"].map(values))
 
     return result
 
@@ -465,15 +474,17 @@ def df(
     features: bool | list[str] = False,
 ) -> pd.DataFrame:
     """{}"""  # noqa: D415
-    field_names = get_basic_field_names(self)
+    if include is None:
+        include = []
+    elif isinstance(include, str):
+        include = [include]
+    field_names = get_basic_field_names(self, include, features)
     annotate_kwargs = {}
     if features:
         annotate_kwargs.update(get_feature_annotate_kwargs(features))
     if include:
-        if isinstance(include, str):
-            include = [include]
         include = include.copy()[::-1]
-        include_kwargs = {s: F(s) for s in include}
+        include_kwargs = {s: F(s) for s in include if s not in field_names}
         annotate_kwargs.update(include_kwargs)
     if annotate_kwargs:
         queryset = self.annotate(**annotate_kwargs)
diff --git a/sub/lnschema-core b/sub/lnschema-core
index 47ea16022..0b718bc5c 160000
--- a/sub/lnschema-core
+++ b/sub/lnschema-core
@@ -1 +1 @@
-Subproject commit 47ea160227f223471cfdd7e66856bb3fb0514e35
+Subproject commit 0b718bc5ca5b48ccfc899665149de236f26384a3
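With the subsetting that patch 09 adds to `get_basic_field_names()`, a basic field such as `suffix` can be listed in `include` without producing a duplicate column, since `include_kwargs` now skips names that are already basic fields. A sketch of the call the updated notebook cell makes (column contents depend on your instance):

```python
import lamindb as ln

# "suffix" is a basic Artifact field; listing it in `include` keeps it in
# the output instead of annotating it a second time
df = ln.Artifact.df(
    include=["created_by__name", "ulabels__name", "suffix"],
)
assert df.columns.is_unique
```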
From 66045c9d32a955f734709488fd561220aa82917b Mon Sep 17 00:00:00 2001
From: Alex Wolf
Date: Sat, 30 Nov 2024 02:56:32 +0100
Subject: [PATCH 10/10] =?UTF-8?q?=F0=9F=92=9A=20Fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/core/test_queryset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/core/test_queryset.py b/tests/core/test_queryset.py
index d34334863..1dd666f2d 100644
--- a/tests/core/test_queryset.py
+++ b/tests/core/test_queryset.py
@@ -13,10 +13,10 @@ def test_df():
     labels = ln.ULabel.from_values(project_names, create=True).save()
     project_label.children.add(*labels)
     df = ln.ULabel.df(include="parents__name")
-    assert df.columns[0] == "parents__name"
+    assert df.columns[3] == "parents__name"
     assert df["parents__name"].iloc[0] == {project_label.name}
     df = ln.ULabel.df(include=["parents__name", "parents__created_by_id"])
-    assert df.columns[1] == "parents__created_by_id"
+    assert df.columns[4] == "parents__created_by_id"
     assert df["parents__name"].iloc[0] == {project_label.name}
     assert set(df["parents__created_by_id"].iloc[0]) == {current_user_id()}
 
@@ -28,14 +28,14 @@ def test_df():
     feature_set.features.set(features)
 
     df = ln.FeatureSet.filter(name="my feature_set").df(include="features__name")
-    assert df.columns[0] == "features__name"
+    assert df.columns[3] == "features__name"
     # order is not conserved
     assert set(df["features__name"].iloc[0]) == set(feature_names)
     # pass a list
     df = ln.FeatureSet.filter(name="my feature_set").df(
         include=["features__name", "features__created_by_id"]
     )
-    assert df.columns[1] == "features__created_by_id"
+    assert df.columns[4] == "features__created_by_id"
     assert set(df["features__name"].iloc[0]) == set(feature_names)
     assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()}
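The updated assertions pin down the new layout: a registry's own basic fields come first, and joined columns such as `parents__name` follow from index 3 onward. A quick way to eyeball this locally (label names are hypothetical):

```python
import lamindb as ln

parent = ln.ULabel(name="my project").save()
child = ln.ULabel(name="my sub-project").save()
parent.children.add(child)

# the joined column appears after the basic fields
df = ln.ULabel.df(include="parents__name")
print(df.columns.tolist())
```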