From 463a10accbd689e7e3df3acd28160b036eda1872 Mon Sep 17 00:00:00 2001 From: Zhuoqing Fang Date: Wed, 4 Dec 2024 10:31:00 -0800 Subject: [PATCH] improve biomart --- gseapy/biomart.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/gseapy/biomart.py b/gseapy/biomart.py index fd0858e..943458d 100644 --- a/gseapy/biomart.py +++ b/gseapy/biomart.py @@ -73,7 +73,7 @@ def __init__(self, host: str = "www.ensembl.org", verbose: bool = False): self.reset() # get supported marts - self._marts = self.get_marts()["Mart"].to_list() + self._marts = None def __del__(self): handlers = self._logger.handlers[:] @@ -84,31 +84,29 @@ def __del__(self): def _set_host(self, host: str): """set host""" - hosts = ["www.ensembl.org", "asia.ensembl.org", "useast.ensembl.org"] + hosts = ["useast.ensembl.org", "asia.ensembl.org"] hosts.insert(0, host) - secure = "" - - # if self._secure: - # secure = "s" + secure = "s" # if host not work, select next i = 0 while i < len(hosts): - url = "http{}://{}/biomart/martservice".format(secure, hosts[i]) - request = requests.head(url) - if request.status_code in [200]: + url = "http{}://{}/biomart/martservice?type=registry".format( + secure, hosts[i] + ) + request = requests.get(url) + # '\n\n\n Service unavailable\n + # "\n\n" + if request.ok and request.text.startswith("\n\n"): self.host = hosts[i] break - else: - self._logger.warning( - "host {} is not reachable, will try {} ".format( - hosts[i], hosts[i % len(hosts)] - ) + self._logger.warning( + "host {} is not reachable, try {} ".format( + hosts[i], hosts[(i + 1) % len(hosts)] ) + ) i += 1 if i == len(hosts): - raise ValueError( - "host is not reachable. Please check your input or try again later." - ) + self._logger.warning("hosts is not reachable. Please try again later.") def add_filter(self, name: str, value: Iterable[str]): """ @@ -152,18 +150,21 @@ def get_marts(self): host=self.host, i=self._id ) resp = requests.get(url) - if resp.ok: - # marts = pd.read_xml(resp.text) + if resp.ok and resp.text.startswith("\n\n"): marts = [e.attrib for e in ET.XML(resp.text)] marts = pd.DataFrame(marts) marts = marts.loc[:, ["database", "displayName", "name"]] marts.columns = ["Version", "DisplayName", "Mart"] + # get supported marts + self._marts = marts["Mart"].to_list() return marts.loc[:, ["Mart", "Version"]] return resp.text def get_datasets(self, mart: str = "ENSEMBL_MART_ENSEMBL"): """Get available datasets from mart you've selected""" + if self._marts is None: + self.get_marts() if mart not in self._marts: raise ValueError( "Provided mart name (%s) is not valid. see 'names' attribute" % mart