Merge pull request #163 from bigbio/dev

Small changes related to OLS.
bigbio · Feb 28, 2024 · 07c1dea · 07c1dea
2 parents b20df9d + 3954a3b
commit 07c1dea
Show file tree

Hide file tree

Showing 6 changed files with 111 additions and 24 deletions.
diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
@@ -5,9 +5,9 @@ name: Python application
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, dev ]
   pull_request:
-    branches: [ main ]
+    branches: [ main, dev ]
 
 jobs:
   build:

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -5,9 +5,9 @@ name: Python package
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, dev ]
   pull_request:
-    branches: [ main ]
+    branches: [ main, dev ]
 
 jobs:
   build:

diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.24"
+__version__ = "0.0.25"
diff --git a/sdrf_pipelines/sdrf/sdrf.py b/sdrf_pipelines/sdrf/sdrf.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 import logging
 
 import pandas as pd
+from pandas import DataFrame
+from pandas._typing import PythonFuncType
 
 from sdrf_pipelines.sdrf.sdrf_schema import CELL_LINES_TEMPLATE
 from sdrf_pipelines.sdrf.sdrf_schema import HUMAN_TEMPLATE

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -1,5 +1,6 @@
 import logging
 import re
+import sys
 import typing
 from typing import Any
 
@@ -145,8 +146,8 @@ def validate(self, series: pd.Series) -> pd.Series:
 
             if ontology_terms is not None:
                 query_labels = [o["label"].lower() for o in ontology_terms]
-                for label in query_labels:
-                    labels.append(label)
+                if term[TERM_NAME] in query_labels:
+                    labels.append(term[TERM_NAME])
         if self._not_available:
             labels.append(NOT_AVAILABLE)
         if self._not_applicable:
@@ -179,6 +180,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
             )
             errors.append(LogicError(error_message, error_type=logging.WARN))
 
+        empty_cells_errors = self.validate_empty_cells(panda_sdrf)
+        if empty_cells_errors:
+            errors.extend(empty_cells_errors)
+
         # Check the mandatory fields
         error_mandatory = self.validate_mandatory_columns(panda_sdrf)
         if error_mandatory is not None:
@@ -310,6 +315,37 @@ def check_recommendations(self, panda_sdrf):
             warnings += column.validate_optional(series)
         return sorted(warnings, key=lambda e: e.row)
 
+    def validate_empty_cells(self, panda_sdrf):
+        """
+        Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found.
+        :param panda_sdrf: SDRF dataframe
+        :return: List of errors
+        """
+        errors = []
+
+        def validate_string(cell_value):
+            return cell_value is not None and cell_value != "nan" and len(cell_value.strip()) > 0
+
+        if sys.version_info <= (3, 8):
+            # Use map for Python versions less than 3.8
+            validation_results = panda_sdrf.map(validate_string)
+        else:
+            # Use applymap for Python versions 3.8 and above
+            validation_results = panda_sdrf.applymap(validate_string)
+
+        # Get the indices where the validation fails
+        failed_indices = [
+            (row, col)
+            for row in validation_results.index
+            for col in validation_results.columns
+            if not validation_results.at[row, col]
+        ]
+
+        for row, col in failed_indices:
+            message = f"Empty value found Row: {row}, Column: {col}"
+            errors.append(LogicError(message, error_type=logging.ERROR))
+        return errors
+
 
 default_schema = SDRFSchema(
     [

diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py
@@ -106,13 +106,16 @@ def get_ancestors(self, ont, iri):
 
     def search(
         self,
-        name,
+        name: str,
         query_fields=None,
-        ontology=None,
+        ontology: str = None,
         field_list=None,
         children_of=None,
-        exact=None,
-        bytype="class",
+        exact: bool = None,
+        bytype: str = "class",
+        rows: int = 10,
+        num_retries: int = 10,
+        start: int = 0,
     ):
         """
         Searches the OLS with the given term
@@ -124,6 +127,8 @@ def search(
         @:param exact: Forces exact match if not `None`
         @:param bytype: restrict to terms one of {class,property,individual,ontology}
         @:param childrenOf: Search only under a certain term.
+        @:param rows: number of rows to query on each call of OLS search
+        @:param num_retries: Number of retries to OLS when it fails.
         """
         params = {"q": name}
         if ontology is not None:
@@ -135,6 +140,9 @@ def search(
         if bytype:
             params["type"] = _concat_str_or_list(bytype)
 
+        if rows:
+            params["rows"] = rows
+
         if ontology:
             params["ontology"] = _concat_str_or_list(ontology)
         elif self.ontology:
@@ -155,26 +163,65 @@ def search(
         if len(children_of) > 0:
             params["childrenOf"] = _concat_str_or_list(children_of)
 
-        retry_num = 0
+        if start:
+            params["start"] = start
+
+        docs_found = []
 
-        while retry_num < 10:
+        for retry_num in range(num_retries):
             try:
                 req = self.session.get(self.ontology_search, params=params)
-                logger.debug("Request to OLS search API: %s - %s", req.status_code, name)
+                logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code)
 
-                req.raise_for_status()
-                if req.json()["response"]["numFound"]:
-                    return req.json()["response"]["docs"]
-                if exact:
-                    logger.debug("OLS exact search returned empty response for %s", name)
+                if req.status_code != 200:
+                    logger.error("OLS search term %s error tried number %s", name, retry_num)
+                    req.raise_for_status()
                 else:
-                    logger.debug("OLS search returned empty response for %s", name)
-                return None
+                    if req.json()["response"]["numFound"] == 0:
+                        if exact:
+                            logger.debug("OLS exact search returned empty response for %s", name)
+                        else:
+                            logger.debug("OLS search returned empty response for %s", name)
+                        return docs_found
+                    elif len(req.json()["response"]["docs"]) < rows:
+                        return req.json()["response"]["docs"]
+                    else:
+                        docs_found = req.json()["response"]["docs"]
+                        docs_found.extend(
+                            self.search(
+                                name,
+                                query_fields=query_fields,
+                                ontology=ontology,
+                                field_list=field_list,
+                                children_of=children_of,
+                                exact=exact,
+                                bytype=bytype,
+                                rows=rows,
+                                num_retries=num_retries,
+                                start=(rows + (start)),
+                            )
+                        )
+                        return docs_found
+
+                if req.status_code == 200 and req.json()["response"]["numFound"] == 0:
+                    if exact:
+                        logger.debug("OLS exact search returned empty response for %s", name)
+                    else:
+                        logger.debug("OLS search returned empty response for %s", name)
+                    return None
+                elif req.status_code != 200 and req.json()["response"]["numFound"] > 0:
+                    if len(req.json()["response"]["docs"]) <= rows:
+                        return req.json()["response"]["docs"]
+                    else:
+                        start = 0
+                        docs_found = req.json()["response"]["docs"]
+
             except Exception as ex:
-                retry_num += 1
-                logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)
+                logger.exception(
+                    "OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex
+                )
 
-        return None
+        return docs_found
 
     def suggest(self, name, ontology=None):
         """Suggest terms from an optional list of ontologies