Merge pull request #174 from bigbio/dev
Small update to the ontology cache code and a refresh of all cached ontologies.
ypriverol authored Oct 7, 2024
2 parents 528db8c + 20cbdf9 commit 24151ed
Showing 16 changed files with 33 additions and 4 deletions.
2 changes: 1 addition & 1 deletion sdrf_pipelines/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.29"
+__version__ = "0.0.30"
Binary file modified sdrf_pipelines/ols/bto.parquet
Binary file modified sdrf_pipelines/ols/chebi.parquet
Binary file modified sdrf_pipelines/ols/cl.parquet
Binary file modified sdrf_pipelines/ols/clo.parquet
Binary file removed sdrf_pipelines/ols/efo-base.parquet
Binary file added sdrf_pipelines/ols/efo.parquet
Binary file modified sdrf_pipelines/ols/mondo.parquet
Binary file modified sdrf_pipelines/ols/ncbitaxon.parquet
Binary file modified sdrf_pipelines/ols/ncit.parquet
35 changes: 32 additions & 3 deletions sdrf_pipelines/ols/ols.py
@@ -236,7 +236,24 @@ def build_ontology_index(ontology_file: str, output_file: str = None, ontology_n
         terms = [term for term in terms if "label" in term]
         df = pd.DataFrame(terms)
 
-        df.to_parquet(output_file, compression="gzip")
+        # Convert to lowercase as needed
+        df["accession"] = df["accession"].str.lower()
+        df["label"] = df["label"].str.lower()
+        df["ontology"] = df["ontology"].str.lower()
+
+        # Enforce data types (schema)
+        df["accession"] = df["accession"].astype("string")  # Ensuring a string type
+        df["label"] = df["label"].astype("string")  # Ensuring a string type
+        df["ontology"] = df["ontology"].astype("string")  # Ensuring a string type
+
+        # Remove terms with no label or accession and print a warning
+        df = df.dropna(subset=["label", "accession"])
+        if df.empty:
+            logger.warning("No terms found in %s", ontology_file)
+            raise ValueError(f"No terms found in {ontology_file}")
+        logger.info("Terms found in %s: %s", ontology_file, len(df))
+
+        df.to_parquet(output_file, compression="gzip", index=False)
         logger.info("Index has finished, output file: %s", output_file)
 
     def besthit(self, name, **kwargs):
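
For context, the hunk above boils down to: lower-case the accession, label, and ontology columns, force them to a string dtype, drop rows without a label or accession, and only then write the gzip-compressed parquet cache. Below is a minimal, self-contained sketch of that pattern; the normalize_terms helper, the toy DataFrame, and the efo.parquet path are hypothetical and only illustrate the idea.

import logging

import pandas as pd

logger = logging.getLogger(__name__)


def normalize_terms(df: pd.DataFrame, ontology_file: str) -> pd.DataFrame:
    # Lower-case the three columns and force a string dtype so cache lookups are case-insensitive.
    for col in ("accession", "label", "ontology"):
        df[col] = df[col].str.lower().astype("string")
    # Drop rows missing a label or accession; fail loudly if nothing usable is left.
    df = df.dropna(subset=["label", "accession"])
    if df.empty:
        logger.warning("No terms found in %s", ontology_file)
        raise ValueError(f"No terms found in {ontology_file}")
    logger.info("Terms found in %s: %s", ontology_file, len(df))
    return df


# Hypothetical usage with toy data.
terms = pd.DataFrame({"accession": ["EFO:0000001"], "label": ["Example Term"], "ontology": ["EFO"]})
normalize_terms(terms, "efo.owl").to_parquet("efo.parquet", compression="gzip", index=False)

Writing with index=False also keeps the cache free of a spurious pandas index column.
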
@@ -411,13 +428,25 @@ def cache_search(self, term: str, ontology: str, full_search: bool = False) -> l
             return []
 
         if ontology is not None:
+            # Query for case-insensitive search and ensure all fields are cast to string
             duckdb_conn = duckdb.execute(
-                """SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?) AND lower(ontology) = lower(?)""",
+                """SELECT CAST(accession AS VARCHAR) AS accession,
+                          CAST(label AS VARCHAR) AS label,
+                          CAST(ontology AS VARCHAR) AS ontology
+                   FROM read_parquet(?)
+                   WHERE lower(CAST(label AS VARCHAR)) = lower(?)
+                   AND lower(CAST(ontology AS VARCHAR)) = lower(?)""",
                 (self.parquet_files, term, ontology),
             )
         else:
+            # Query for case-insensitive search without ontology
             duckdb_conn = duckdb.execute(
-                """SELECT * FROM read_parquet(?) WHERE lower(label) = lower(?)""", (self.parquet_files, term)
+                """SELECT CAST(accession AS VARCHAR) AS accession,
+                          CAST(label AS VARCHAR) AS label,
+                          CAST(ontology AS VARCHAR) AS ontology
+                   FROM read_parquet(?)
+                   WHERE lower(CAST(label AS VARCHAR)) = lower(?)""",
+                (self.parquet_files, term),
             )
         df = duckdb_conn.fetchdf()
 
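
For context, the rewritten cache_search query above reads the cached parquet files with DuckDB, casts every column to VARCHAR, and compares labels (and optionally ontologies) case-insensitively. A short standalone sketch under those assumptions follows; the file list, term, and ontology values are hypothetical.

import duckdb

# Hypothetical inputs: cached ontology parquet files and a search term.
parquet_files = ["sdrf_pipelines/ols/efo.parquet", "sdrf_pipelines/ols/ncbitaxon.parquet"]
term, ontology = "Homo Sapiens", "ncbitaxon"

# Casting to VARCHAR before lower() keeps the comparison working even if a cache
# file stores these columns with a non-string physical type.
result = duckdb.execute(
    """SELECT CAST(accession AS VARCHAR) AS accession,
              CAST(label AS VARCHAR) AS label,
              CAST(ontology AS VARCHAR) AS ontology
       FROM read_parquet(?)
       WHERE lower(CAST(label AS VARCHAR)) = lower(?)
         AND lower(CAST(ontology AS VARCHAR)) = lower(?)""",
    (parquet_files, term, ontology),
).fetchdf()
print(result)
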
Binary file modified sdrf_pipelines/ols/pato.parquet
Binary file modified sdrf_pipelines/ols/pride.parquet
Binary file modified sdrf_pipelines/ols/psi-ms.parquet
Binary file modified sdrf_pipelines/ols/uberon.parquet
Binary file modified sdrf_pipelines/ols/unimod.parquet
