Skip to content

Commit

Permalink
⚡️ Add boolean field IsVersioned.is_latest to speed up queries for …
Browse files Browse the repository at this point in the history
…latest versions (#1811)
  • Loading branch information
falexwolf authored Aug 13, 2024
1 parent 0cf49e6 commit f03ae91
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
- "faq"
- "storage"
- "cli"
timeout-minutes: 6
timeout-minutes: 7

steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion lamindb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"""

# denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
__version__ = "0.75.1"
__version__ = "0.76a1"

import os as _os

Expand Down
1 change: 1 addition & 0 deletions lamindb/_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
kwargs["description"] = description
kwargs["visibility"] = visibility
kwargs["_accessor"] = accessor
kwargs["is_new_version_of"] = is_new_version_of
# this check needs to come down here because key might be populated from an
# existing file path during get_artifact_kwargs_from_data()
if (
Expand Down
1 change: 1 addition & 0 deletions lamindb/_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def __init__(
run=run,
version=version,
visibility=visibility,
is_new_version_of=is_new_version_of,
**kwargs,
)
collection._artifacts = artifacts
Expand Down
27 changes: 2 additions & 25 deletions lamindb/_query_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,10 @@ def one_or_none(self) -> Record | None:
else:
raise MultipleResultsFound(self.all())

def latest_version(self) -> RecordsList:
def latest_version(self) -> QuerySet:
"""Filter every version family by latest version."""
if issubclass(self.model, IsVersioned):
return filter_query_set_by_latest_version(self)
return self.filter(is_latest=True)
else:
raise ValueError("Record isn't subclass of `lamindb.core.IsVersioned`")

Expand Down Expand Up @@ -288,29 +288,6 @@ def standardize(
return _standardize(cls=self, values=values, field=field, **kwargs)


def filter_query_set_by_latest_version(ordered_query_set: QuerySet) -> RecordsList:
# evaluating length can be very costly, hence, the try-except block
try:
first_record = ordered_query_set[0]
except IndexError:
return ordered_query_set
records_in_view = {}
records_in_view[first_record.stem_uid] = first_record
for record in ordered_query_set:
# this overwrites user-provided ordering (relevant records ordered by a
# certain field will not show if they are not the latest version)
if record.stem_uid not in records_in_view:
records_in_view[record.stem_uid] = record
else:
if record.created_at > records_in_view[record.stem_uid].created_at:
# deleting the entry is needed to preserve the integrity of
# user-provided ordering
del records_in_view[record.stem_uid]
records_in_view[record.stem_uid] = record
list_records_in_view = RecordsList(records_in_view.values())
return list_records_in_view


models.QuerySet.df = QuerySet.df
models.QuerySet.list = QuerySet.list
models.QuerySet.first = QuerySet.first
Expand Down
55 changes: 52 additions & 3 deletions lamindb/_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import dj_database_url
import lamindb_setup as ln_setup
from django.db import connections
from django.db import connections, transaction
from django.db.models import IntegerField, Manager, Q, QuerySet, Value
from lamin_utils import logger
from lamin_utils._lookup import Lookup
Expand Down Expand Up @@ -132,7 +132,10 @@ def get(cls, idlike: int | str) -> Record:
else:
qs = filter(cls, uid__startswith=idlike)
if issubclass(cls, IsVersioned):
return qs.latest_version().one()
if len(idlike) <= cls._len_stem_uid:
return qs.latest_version().one()
else:
return qs.one()
else:
return qs.one()

Expand Down Expand Up @@ -526,7 +529,28 @@ def save(self, *args, **kwargs) -> Record:
if result is not None:
init_self_from_db(self, result)
else:
super(Record, self).save(*args, **kwargs)
# save versioned record
if isinstance(self, IsVersioned) and self._is_new_version_of is not None:
if self._is_new_version_of.is_latest:
is_new_version_of = self._is_new_version_of
else:
# need one additional request
is_new_version_of = self.__class__.objects.get(
is_latest=True, uid__startswith=self.stem_uid
)
logger.warning(
f"didn't pass the latest version in `is_new_version_of`, retrieved it: {is_new_version_of}"
)
is_new_version_of.is_latest = False
with transaction.atomic():
is_new_version_of._is_new_version_of = (
None # ensure we don't start a recursion
)
is_new_version_of.save()
super(Record, self).save(*args, **kwargs)
# save unversioned record
else:
super(Record, self).save(*args, **kwargs)
# perform transfer of many-to-many fields
# only supported for Artifact and Collection records
if db is not None and db != "default" and using_key is None:
Expand All @@ -551,6 +575,30 @@ def save(self, *args, **kwargs) -> Record:
return self


def delete(self) -> None:
"""Delete the record."""
# note that the logic below does not fire if a record is moved to the trash
# the idea is that moving a record to the trash should move its entire version family
# to the trash, whereas permanently deleting should default to only deleting a single record
# of a version family
# we can consider making it easy to permanently delete entire version families as well,
# but that's for another time
if isinstance(self, IsVersioned) and self.is_latest:
new_latest = (
self.__class__.filter(is_latest=False, uid__startswith=self.stem_uid)
.order_by("-created_at")
.first()
)
if new_latest is not None:
new_latest.is_latest = True
with transaction.atomic():
new_latest.save()
super(Record, self).delete()
logger.warning(f"new latest version is {new_latest}")
return None
super(Record, self).delete()


METHOD_NAMES = [
"__init__",
"filter",
Expand All @@ -559,6 +607,7 @@ def save(self, *args, **kwargs) -> Record:
"search",
"lookup",
"save",
"delete",
"from_values",
"using",
]
Expand Down
2 changes: 1 addition & 1 deletion lamindb/_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def __init__(transform: Transform, *args, **kwargs):
is_new_version_of: Transform | None = (
kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
)
(kwargs.pop("initial_version_id") if "initial_version_id" in kwargs else None)
version: str | None = kwargs.pop("version") if "version" in kwargs else None
type: TransformType | None = kwargs.pop("type") if "type" in kwargs else "pipeline"
reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
Expand Down Expand Up @@ -55,6 +54,7 @@ def __init__(transform: Transform, *args, **kwargs):
reference=reference,
reference_type=reference_type,
_has_consciously_provided_uid=has_consciously_provided_uid,
is_new_version_of=is_new_version_of,
)


Expand Down
1 change: 0 additions & 1 deletion lamindb/core/versioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def set_version(version: str | None = None, previous_version: str | None = None)
return version


# uses `initial_version_id` to extract a stem_id that's part of id
def init_uid(
*,
version: str | None = None,
Expand Down
3 changes: 1 addition & 2 deletions tests/test_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,8 +554,7 @@ def test_create_big_file_from_remote_path():
filepath_str = "s3://lamindb-test/human_immune.h5ad"
artifact = ln.Artifact(filepath_str)
assert artifact.key == "human_immune.h5ad"
assert artifact.hash.endswith("-2")
assert artifact._hash_type == "md5-n"
assert artifact._hash_type == "md5-2"
ln.settings.storage = previous_storage


Expand Down
22 changes: 19 additions & 3 deletions tests/test_versioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,26 +87,42 @@ def test_latest_version_and_get():
# build one version family
transform_v1 = ln.Transform(name="Introduction")
transform_v1.save()
transform_v2 = ln.Transform(name="Introduction", is_new_version_of=transform_v1)
assert transform_v1.is_latest
assert transform_v1.version is None
# pass the latest version, also vary the name for the fun of it
transform_v2 = ln.Transform(name="Introduction v2", is_new_version_of=transform_v1)
transform_v2.save()
transform_v3 = ln.Transform(name="Introduction", is_new_version_of=transform_v2)
assert not transform_v1.is_latest
assert transform_v2.is_latest
assert transform_v2.version == "2"
# do not pass the latest version to is_new_version_of
transform_v3 = ln.Transform(name="Introduction", is_new_version_of=transform_v1)
transform_v3.save()
assert not ln.Transform.objects.get(name="Introduction v2", version="2").is_latest
assert transform_v3.is_latest
transform_v4 = ln.Transform(name="Introduction")
transform_v4.save()
assert transform_v4.is_latest
# add another transform with the same name that's not part of this family
# but will also be a hit for the query
assert len(ln.Transform.filter(name="Introduction").all()) == 4
assert len(ln.Transform.filter(name="Introduction").all()) == 3
assert len(ln.Transform.filter(name="Introduction").latest_version()) == 2
transform_v4.delete()
with pytest.raises(MultipleResultsFound):
ln.Transform.filter(name="Introduction").one()
assert (
ln.Transform.filter(name="Introduction").latest_version().one() == transform_v3
)

# test get
assert ln.Transform.get(transform_v3.uid) == transform_v3
assert ln.Transform.get(transform_v3.id) == transform_v3
assert ln.Transform.get(transform_v3.uid[:4]) == transform_v3

# test delete
transform_v3.delete()
assert transform_v2.is_latest

# test empty QuerySet
assert (
ln.Transform.filter(name="IntroductionNotExists").latest_version().one_or_none()
Expand Down

0 comments on commit f03ae91

Please sign in to comment.