diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index bd87c20..8f4a78a 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -5,9 +5,9 @@ name: Python application on: push: - branches: [ main ] + branches: [ main, dev ] pull_request: - branches: [ main ] + branches: [ main, dev ] jobs: build: diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 5fb3c65..1a99076 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,9 +5,9 @@ name: Python package on: push: - branches: [ main ] + branches: [ main, dev ] pull_request: - branches: [ main ] + branches: [ main, dev ] jobs: build: diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index 5681085..5a6b518 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.24" +__version__ = "0.0.25" diff --git a/sdrf_pipelines/sdrf/sdrf.py b/sdrf_pipelines/sdrf/sdrf.py index e64bb56..f4f34ab 100644 --- a/sdrf_pipelines/sdrf/sdrf.py +++ b/sdrf_pipelines/sdrf/sdrf.py @@ -1,6 +1,10 @@ +from __future__ import annotations + import logging import pandas as pd +from pandas import DataFrame +from pandas._typing import PythonFuncType from sdrf_pipelines.sdrf.sdrf_schema import CELL_LINES_TEMPLATE from sdrf_pipelines.sdrf.sdrf_schema import HUMAN_TEMPLATE diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index 2c3cf8b..d37fa84 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -1,5 +1,6 @@ import logging import re +import sys import typing from typing import Any @@ -145,8 +146,8 @@ def validate(self, series: pd.Series) -> pd.Series: if ontology_terms is not None: query_labels = [o["label"].lower() for o in ontology_terms] - for label in query_labels: - labels.append(label) + if term[TERM_NAME] in query_labels: + labels.append(term[TERM_NAME]) if self._not_available: labels.append(NOT_AVAILABLE) if self._not_applicable: @@ -179,6 +180,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]: ) errors.append(LogicError(error_message, error_type=logging.WARN)) + empty_cells_errors = self.validate_empty_cells(panda_sdrf) + if empty_cells_errors: + errors.extend(empty_cells_errors) + # Check the mandatory fields error_mandatory = self.validate_mandatory_columns(panda_sdrf) if error_mandatory is not None: @@ -310,6 +315,37 @@ def check_recommendations(self, panda_sdrf): warnings += column.validate_optional(series) return sorted(warnings, key=lambda e: e.row) + def validate_empty_cells(self, panda_sdrf): + """ + Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found. + :param panda_sdrf: SDRF dataframe + :return: List of errors + """ + errors = [] + + def validate_string(cell_value): + return cell_value is not None and cell_value != "nan" and len(cell_value.strip()) > 0 + + if sys.version_info <= (3, 8): + # Use map for Python versions less than 3.8 + validation_results = panda_sdrf.map(validate_string) + else: + # Use applymap for Python versions 3.8 and above + validation_results = panda_sdrf.applymap(validate_string) + + # Get the indices where the validation fails + failed_indices = [ + (row, col) + for row in validation_results.index + for col in validation_results.columns + if not validation_results.at[row, col] + ] + + for row, col in failed_indices: + message = f"Empty value found Row: {row}, Column: {col}" + errors.append(LogicError(message, error_type=logging.ERROR)) + return errors + default_schema = SDRFSchema( [ diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py index c06084a..ace77e1 100644 --- a/sdrf_pipelines/zooma/ols.py +++ b/sdrf_pipelines/zooma/ols.py @@ -106,13 +106,16 @@ def get_ancestors(self, ont, iri): def search( self, - name, + name: str, query_fields=None, - ontology=None, + ontology: str = None, field_list=None, children_of=None, - exact=None, - bytype="class", + exact: bool = None, + bytype: str = "class", + rows: int = 10, + num_retries: int = 10, + start: int = 0, ): """ Searches the OLS with the given term @@ -124,6 +127,8 @@ def search( @:param exact: Forces exact match if not `None` @:param bytype: restrict to terms one of {class,property,individual,ontology} @:param childrenOf: Search only under a certain term. + @:param rows: number of rows to query on each call of OLS search + @:param num_retries: Number of retries to OLS when it fails. """ params = {"q": name} if ontology is not None: @@ -135,6 +140,9 @@ def search( if bytype: params["type"] = _concat_str_or_list(bytype) + if rows: + params["rows"] = rows + if ontology: params["ontology"] = _concat_str_or_list(ontology) elif self.ontology: @@ -155,26 +163,65 @@ def search( if len(children_of) > 0: params["childrenOf"] = _concat_str_or_list(children_of) - retry_num = 0 + if start: + params["start"] = start + + docs_found = [] - while retry_num < 10: + for retry_num in range(num_retries): try: req = self.session.get(self.ontology_search, params=params) - logger.debug("Request to OLS search API: %s - %s", req.status_code, name) + logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code) - req.raise_for_status() - if req.json()["response"]["numFound"]: - return req.json()["response"]["docs"] - if exact: - logger.debug("OLS exact search returned empty response for %s", name) + if req.status_code != 200: + logger.error("OLS search term %s error tried number %s", name, retry_num) + req.raise_for_status() else: - logger.debug("OLS search returned empty response for %s", name) - return None + if req.json()["response"]["numFound"] == 0: + if exact: + logger.debug("OLS exact search returned empty response for %s", name) + else: + logger.debug("OLS search returned empty response for %s", name) + return docs_found + elif len(req.json()["response"]["docs"]) < rows: + return req.json()["response"]["docs"] + else: + docs_found = req.json()["response"]["docs"] + docs_found.extend( + self.search( + name, + query_fields=query_fields, + ontology=ontology, + field_list=field_list, + children_of=children_of, + exact=exact, + bytype=bytype, + rows=rows, + num_retries=num_retries, + start=(rows + (start)), + ) + ) + return docs_found + + if req.status_code == 200 and req.json()["response"]["numFound"] == 0: + if exact: + logger.debug("OLS exact search returned empty response for %s", name) + else: + logger.debug("OLS search returned empty response for %s", name) + return None + elif req.status_code != 200 and req.json()["response"]["numFound"] > 0: + if len(req.json()["response"]["docs"]) <= rows: + return req.json()["response"]["docs"] + else: + start = 0 + docs_found = req.json()["response"]["docs"] + except Exception as ex: - retry_num += 1 - logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex) + logger.exception( + "OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex + ) - return None + return docs_found def suggest(self, name, ontology=None): """Suggest terms from an optional list of ontologies