Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id when repeatedly validating example dataset #2137

Closed
Zethson opened this issue Nov 7, 2024 · 3 comments · Fixed by #2312
Assignees

Comments

@Zethson
Copy link
Member

Zethson commented Nov 7, 2024

Report

!lamin init --storage ./run-tests --name run-tests --schema bionty

import lamindb as ln
import bionty as bt

adata = ln.core.datasets.anndata_pbmc68k_reduced()

curator = ln.Curator.from_anndata(adata, var_index=bt.Gene.ensembl_gene_id, organism="human")
curator.validate()
curator.validate()

leads to

{
	"name": "IntegrityError",
	"message": "UNIQUE constraint failed: bionty_gene.ensembl_gene_id",
	"stack": "---------------------------------------------------------------------------
IntegrityError                            Traceback (most recent call last)
File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    104 else:
--> 105     return self.cursor.execute(sql, params)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/sqlite3/base.py:354, in SQLiteCursorWrapper.execute(self, query, params)
    353 query = self.convert_query(query, param_names=param_names)
--> 354 return super().execute(query, params)

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id

The above exception was the direct cause of the following exception:

IntegrityError                            Traceback (most recent call last)
Cell In[3], line 1
----> 1 curator.validate()

File ~/PycharmProjects/lamindb/lamindb/_curate.py:548, in AnnDataCurator.validate(self, organism)
    543     logger.important(
    544         f\"validating metadata using registries of instance {colors.italic(self._using_key)}\"
    545     )
    547 # add all validated records to the current instance
--> 548 self._update_registry_all()
    550 validated_var, non_validated_var = validate_categories(
    551     self._adata.var.index,
    552     field=self._var_field,
   (...)
    558     **self._kwargs,  # type: ignore
    559 )
    560 validated_obs, non_validated_obs = validate_categories_in_df(
    561     self._adata.obs,
    562     fields=self.categoricals,
   (...)
    566     **self._kwargs,
    567 )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:517, in AnnDataCurator._update_registry_all(self, validated_only, **kwargs)
    515 \"\"\"Save labels for all features.\"\"\"
    516 logger.info(\"saving validated records of 'var_index'\")
--> 517 self._save_from_var_index(validated_only=validated_only, **self._kwargs)
    518 for name in self._obs_fields.keys():
    519     logger.info(f\"saving validated terms of '{name}'\")

File ~/PycharmProjects/lamindb/lamindb/_curate.py:502, in AnnDataCurator._save_from_var_index(self, validated_only, organism)
    498 def _save_from_var_index(
    499     self, validated_only: bool = True, organism: str | None = None
    500 ):
    501     \"\"\"Save variable records.\"\"\"
--> 502     update_registry(
    503         values=list(self._adata.var.index),
    504         field=self.var_index,
    505         key=\"var_index\",
    506         save_function=\".add_new_from_var_index()\",
    507         using_key=self._using_key,
    508         validated_only=validated_only,
    509         organism=organism,
    510         source=self._sources.get(\"var_index\"),
    511         exclude=self._exclude.get(\"var_index\"),
    512     )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:1512, in update_registry(values, field, key, save_function, using_key, validated_only, df, organism, dtype, source, standardize, warning, exclude, **kwargs)
   1510 if source:
   1511     public_records = [r for r in public_records if r.source.uid == source.uid]
-> 1512 ln_save(public_records)
   1513 labels_saved[\"from public\"] = [
   1514     getattr(r, field.field.name) for r in public_records
   1515 ]
   1516 non_public_labels = [i for i in values if i not in labels_saved[\"from public\"]]

File ~/PycharmProjects/lamindb/lamindb/_save.py:83, in save(records, ignore_conflicts)
     79 if non_artifacts:
     80     non_artifacts_old, non_artifacts_new = partition(
     81         lambda r: r._state.adding or r.pk is None, non_artifacts
     82     )
---> 83     bulk_create(non_artifacts_new, ignore_conflicts=ignore_conflicts)
     84     if non_artifacts_old:
     85         bulk_update(non_artifacts_old)

File ~/PycharmProjects/lamindb/lamindb/_save.py:114, in bulk_create(records, ignore_conflicts)
    112     records_by_orm[record.__class__].append(record)
    113 for registry, records in records_by_orm.items():
--> 114     registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/manager.py:87, in BaseManager._get_queryset_methods.<locals>.create_method.<locals>.manager_method(self, *args, **kwargs)
     85 @wraps(method)
     86 def manager_method(self, *args, **kwargs):
---> 87     return getattr(self.get_queryset(), name)(*args, **kwargs)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:835, in QuerySet.bulk_create(self, objs, batch_size, ignore_conflicts, update_conflicts, update_fields, unique_fields)
    833 if objs_without_pk:
    834     fields = [f for f in fields if not isinstance(f, AutoField)]
--> 835     returned_columns = self._batched_insert(
    836         objs_without_pk,
    837         fields,
    838         batch_size,
    839         on_conflict=on_conflict,
    840         update_fields=update_fields,
    841         unique_fields=unique_fields,
    842     )
    843     connection = connections[self.db]
    844     if (
    845         connection.features.can_return_rows_from_bulk_insert
    846         and on_conflict is None
    847     ):

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:1875, in QuerySet._batched_insert(self, objs, fields, batch_size, on_conflict, update_fields, unique_fields)
   1870 for item in [objs[i : i + batch_size] for i in range(0, len(objs), batch_size)]:
   1871     if bulk_return and (
   1872         on_conflict is None or on_conflict == OnConflict.UPDATE
   1873     ):
   1874         inserted_rows.extend(
-> 1875             self._insert(
   1876                 item,
   1877                 fields=fields,
   1878                 using=self.db,
   1879                 on_conflict=on_conflict,
   1880                 update_fields=update_fields,
   1881                 unique_fields=unique_fields,
   1882                 returning_fields=self.model._meta.db_returning_fields,
   1883             )
   1884         )
   1885     else:
   1886         self._insert(
   1887             item,
   1888             fields=fields,
   (...)
   1892             unique_fields=unique_fields,
   1893         )

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:1847, in QuerySet._insert(self, objs, fields, returning_fields, raw, using, on_conflict, update_fields, unique_fields)
   1840 query = sql.InsertQuery(
   1841     self.model,
   1842     on_conflict=on_conflict,
   1843     update_fields=update_fields,
   1844     unique_fields=unique_fields,
   1845 )
   1846 query.insert_values(fields, objs, raw=raw)
-> 1847 return query.get_compiler(using=using).execute_sql(returning_fields)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/sql/compiler.py:1836, in SQLInsertCompiler.execute_sql(self, returning_fields)
   1834 with self.connection.cursor() as cursor:
   1835     for sql, params in self.as_sql():
-> 1836         cursor.execute(sql, params)
   1837     if not self.returning_fields:
   1838         return []

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:79, in CursorWrapper.execute(self, sql, params)
     78 def execute(self, sql, params=None):
---> 79     return self._execute_with_wrappers(
     80         sql, params, many=False, executor=self._execute
     81     )

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:92, in CursorWrapper._execute_with_wrappers(self, sql, params, many, executor)
     90 for wrapper in reversed(self.db.execute_wrappers):
     91     executor = functools.partial(wrapper, executor)
---> 92 return executor(sql, params, many, context)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:100, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
     98     warnings.warn(self.APPS_NOT_READY_WARNING_MSG, category=RuntimeWarning)
     99 self.db.validate_no_broken_transaction()
--> 100 with self.db.wrap_database_errors:
    101     if params is None:
    102         # params default might be backend specific.
    103         return self.cursor.execute(sql)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/utils.py:91, in DatabaseErrorWrapper.__exit__(self, exc_type, exc_value, traceback)
     89 if dj_exc_type not in (DataError, IntegrityError):
     90     self.wrapper.errors_occurred = True
---> 91 raise dj_exc_value.with_traceback(traceback) from exc_value

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    103     return self.cursor.execute(sql)
    104 else:
--> 105     return self.cursor.execute(sql, params)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/sqlite3/base.py:354, in SQLiteCursorWrapper.execute(self, query, params)
    352 param_names = list(params) if isinstance(params, Mapping) else None
    353 query = self.convert_query(query, param_names=param_names)
--> 354 return super().execute(query, params)

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id"
}

Version information

No response

@Zethson
Copy link
Member Author

Zethson commented Nov 7, 2024

I don't seem to have this issue with synthetic example data.

@sunnyosun
Copy link
Member

The issue with this example is that var.index contains gene symbols, not Ensembl gene IDs, so the call was wrong.

It should be:

curator = ln.Curator.from_anndata(adata, var_index=bt.Gene.symbol, organism="human")
curator.validate()

and it works.

You can tell from the logging produced by the wrong call, as it tries to map synonyms from name to Ensembl ID:
Screenshot 2025-01-02 at 16 59 00

@sunnyosun sunnyosun reopened this Jan 2, 2025
@sunnyosun
Copy link
Member

This PR makes sure that synonym mapping is only performed on the name field: #2312

Screenshot 2025-01-02 at 19 45 35

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants