Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚡️ Add boolean field IsVersioned.is_latest to speed up queries for latest versions #1811

Merged
merged 11 commits into from
Aug 13, 2024
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
- "faq"
- "storage"
- "cli"
timeout-minutes: 6
timeout-minutes: 7

steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion lamindb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"""

# denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
__version__ = "0.75.1"
__version__ = "0.76a1"

import os as _os

Expand Down
1 change: 1 addition & 0 deletions lamindb/_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
kwargs["description"] = description
kwargs["visibility"] = visibility
kwargs["_accessor"] = accessor
kwargs["is_new_version_of"] = is_new_version_of
# this check needs to come down here because key might be populated from an
# existing file path during get_artifact_kwargs_from_data()
if (
Expand Down
1 change: 1 addition & 0 deletions lamindb/_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def __init__(
run=run,
version=version,
visibility=visibility,
is_new_version_of=is_new_version_of,
**kwargs,
)
collection._artifacts = artifacts
Expand Down
27 changes: 2 additions & 25 deletions lamindb/_query_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,10 @@ def one_or_none(self) -> Record | None:
else:
raise MultipleResultsFound(self.all())

def latest_version(self) -> RecordsList:
def latest_version(self) -> QuerySet:
"""Filter every version family by latest version."""
if issubclass(self.model, IsVersioned):
return filter_query_set_by_latest_version(self)
return self.filter(is_latest=True)
else:
raise ValueError("Record isn't subclass of `lamindb.core.IsVersioned`")

Expand Down Expand Up @@ -288,29 +288,6 @@ def standardize(
return _standardize(cls=self, values=values, field=field, **kwargs)


def filter_query_set_by_latest_version(ordered_query_set: QuerySet) -> RecordsList:
# evaluating length can be very costly, hence, the try-except block
try:
first_record = ordered_query_set[0]
except IndexError:
return ordered_query_set
records_in_view = {}
records_in_view[first_record.stem_uid] = first_record
for record in ordered_query_set:
# this overwrites user-provided ordering (relevant records ordered by a
# certain field will not show if they are not the latest version)
if record.stem_uid not in records_in_view:
records_in_view[record.stem_uid] = record
else:
if record.created_at > records_in_view[record.stem_uid].created_at:
# deleting the entry is needed to preserve the integrity of
# user-provided ordering
del records_in_view[record.stem_uid]
records_in_view[record.stem_uid] = record
list_records_in_view = RecordsList(records_in_view.values())
return list_records_in_view


models.QuerySet.df = QuerySet.df
models.QuerySet.list = QuerySet.list
models.QuerySet.first = QuerySet.first
Expand Down
55 changes: 52 additions & 3 deletions lamindb/_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import dj_database_url
import lamindb_setup as ln_setup
from django.db import connections
from django.db import connections, transaction
from django.db.models import IntegerField, Manager, Q, QuerySet, Value
from lamin_utils import logger
from lamin_utils._lookup import Lookup
Expand Down Expand Up @@ -132,7 +132,10 @@ def get(cls, idlike: int | str) -> Record:
else:
qs = filter(cls, uid__startswith=idlike)
if issubclass(cls, IsVersioned):
return qs.latest_version().one()
if len(idlike) <= cls._len_stem_uid:
return qs.latest_version().one()
else:
return qs.one()
else:
return qs.one()

Expand Down Expand Up @@ -526,7 +529,28 @@ def save(self, *args, **kwargs) -> Record:
if result is not None:
init_self_from_db(self, result)
else:
super(Record, self).save(*args, **kwargs)
# save versioned record
if isinstance(self, IsVersioned) and self._is_new_version_of is not None:
if self._is_new_version_of.is_latest:
is_new_version_of = self._is_new_version_of
else:
# need one additional request
is_new_version_of = self.__class__.objects.get(
is_latest=True, uid__startswith=self.stem_uid
)
logger.warning(
f"didn't pass the latest version in `is_new_version_of`, retrieved it: {is_new_version_of}"
)
is_new_version_of.is_latest = False
with transaction.atomic():
is_new_version_of._is_new_version_of = (
None # ensure we don't start a recursion
)
is_new_version_of.save()
super(Record, self).save(*args, **kwargs)
# save unversioned record
else:
super(Record, self).save(*args, **kwargs)
# perform transfer of many-to-many fields
# only supported for Artifact and Collection records
if db is not None and db != "default" and using_key is None:
Expand All @@ -551,6 +575,30 @@ def save(self, *args, **kwargs) -> Record:
return self


def delete(self) -> None:
"""Delete the record."""
# note that the logic below does not fire if a record is moved to the trash
# the idea is that moving a record to the trash should move its entire version family
# to the trash, whereas permanently deleting should default to only deleting a single record
# of a version family
# we can consider making it easy to permanently delete entire version families as well,
# but that's for another time
if isinstance(self, IsVersioned) and self.is_latest:
new_latest = (
self.__class__.filter(is_latest=False, uid__startswith=self.stem_uid)
.order_by("-created_at")
.first()
)
if new_latest is not None:
new_latest.is_latest = True
with transaction.atomic():
new_latest.save()
super(Record, self).delete()
logger.warning(f"new latest version is {new_latest}")
return None
super(Record, self).delete()


METHOD_NAMES = [
"__init__",
"filter",
Expand All @@ -559,6 +607,7 @@ def save(self, *args, **kwargs) -> Record:
"search",
"lookup",
"save",
"delete",
"from_values",
"using",
]
Expand Down
2 changes: 1 addition & 1 deletion lamindb/_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def __init__(transform: Transform, *args, **kwargs):
is_new_version_of: Transform | None = (
kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
)
(kwargs.pop("initial_version_id") if "initial_version_id" in kwargs else None)
version: str | None = kwargs.pop("version") if "version" in kwargs else None
type: TransformType | None = kwargs.pop("type") if "type" in kwargs else "pipeline"
reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
Expand Down Expand Up @@ -55,6 +54,7 @@ def __init__(transform: Transform, *args, **kwargs):
reference=reference,
reference_type=reference_type,
_has_consciously_provided_uid=has_consciously_provided_uid,
is_new_version_of=is_new_version_of,
)


Expand Down
1 change: 0 additions & 1 deletion lamindb/core/versioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def set_version(version: str | None = None, previous_version: str | None = None)
return version


# uses `initial_version_id` to extract a stem_id that's part of id
def init_uid(
*,
version: str | None = None,
Expand Down
3 changes: 1 addition & 2 deletions tests/test_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,8 +554,7 @@ def test_create_big_file_from_remote_path():
filepath_str = "s3://lamindb-test/human_immune.h5ad"
artifact = ln.Artifact(filepath_str)
assert artifact.key == "human_immune.h5ad"
assert artifact.hash.endswith("-2")
assert artifact._hash_type == "md5-n"
assert artifact._hash_type == "md5-2"
ln.settings.storage = previous_storage


Expand Down
22 changes: 19 additions & 3 deletions tests/test_versioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,26 +87,42 @@ def test_latest_version_and_get():
# build one version family
transform_v1 = ln.Transform(name="Introduction")
transform_v1.save()
transform_v2 = ln.Transform(name="Introduction", is_new_version_of=transform_v1)
assert transform_v1.is_latest
assert transform_v1.version is None
# pass the latest version, also vary the name for the fun of it
transform_v2 = ln.Transform(name="Introduction v2", is_new_version_of=transform_v1)
transform_v2.save()
transform_v3 = ln.Transform(name="Introduction", is_new_version_of=transform_v2)
assert not transform_v1.is_latest
assert transform_v2.is_latest
assert transform_v2.version == "2"
# do not pass the latest version to is_new_version_of
transform_v3 = ln.Transform(name="Introduction", is_new_version_of=transform_v1)
transform_v3.save()
assert not ln.Transform.objects.get(name="Introduction v2", version="2").is_latest
assert transform_v3.is_latest
transform_v4 = ln.Transform(name="Introduction")
transform_v4.save()
assert transform_v4.is_latest
# add another transform with the same name that's not part of this family
# but will also be a hit for the query
assert len(ln.Transform.filter(name="Introduction").all()) == 4
assert len(ln.Transform.filter(name="Introduction").all()) == 3
assert len(ln.Transform.filter(name="Introduction").latest_version()) == 2
transform_v4.delete()
with pytest.raises(MultipleResultsFound):
ln.Transform.filter(name="Introduction").one()
assert (
ln.Transform.filter(name="Introduction").latest_version().one() == transform_v3
)

# test get
assert ln.Transform.get(transform_v3.uid) == transform_v3
assert ln.Transform.get(transform_v3.id) == transform_v3
assert ln.Transform.get(transform_v3.uid[:4]) == transform_v3

# test delete
transform_v3.delete()
assert transform_v2.is_latest

# test empty QuerySet
assert (
ln.Transform.filter(name="IntroductionNotExists").latest_version().one_or_none()
Expand Down
Loading