From 3b6f9c3c575c229538717c442d06b1ce139cd653 Mon Sep 17 00:00:00 2001 From: Zhuoqing Fang Date: Wed, 4 Dec 2024 10:44:52 -0800 Subject: [PATCH] improve biomart --- gseapy/biomart.py | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/gseapy/biomart.py b/gseapy/biomart.py index 943458d..59a6931 100644 --- a/gseapy/biomart.py +++ b/gseapy/biomart.py @@ -98,6 +98,7 @@ def _set_host(self, host: str): # "\n\n" if request.ok and request.text.startswith("\n\n"): self.host = hosts[i] + self._marts = self._get_mart(request.text) break self._logger.warning( "host {} is not reachable, try {} ".format( @@ -143,29 +144,50 @@ def get_xml(self): xml += line xml += self.footer return xml + + def _get_mart(self, text:str): + """ + Parse the xml text and return a dataframe of supported marts. + + Parameters + ---------- + text : str + an XML text + + Returns + ------- + marts : pd.DataFrame + a dataframe of supported marts with columns: + - Mart: the name of mart + - Version: the version of mart + """ + marts = [e.attrib for e in ET.XML(text)] + marts = pd.DataFrame(marts) + marts = marts.loc[:, ["database", "displayName", "name"]] + marts.columns = ["Version", "DisplayName", "Mart"] + # get supported marts + return marts.loc[:, ["Mart", "Version"]] + def get_marts(self): """Get available marts and their names.""" url = "https://{host}/biomart/martservice?type=registry&requestid=gseapy{i}".format( host=self.host, i=self._id ) + if self._marts is not None: + return self._marts resp = requests.get(url) if resp.ok and resp.text.startswith("\n\n"): - marts = [e.attrib for e in ET.XML(resp.text)] - marts = pd.DataFrame(marts) - marts = marts.loc[:, ["database", "displayName", "name"]] - marts.columns = ["Version", "DisplayName", "Mart"] - # get supported marts - self._marts = marts["Mart"].to_list() - return marts.loc[:, ["Mart", "Version"]] + self._marts = self._get_mart(resp.text) + return self._marts return resp.text def
get_datasets(self, mart: str = "ENSEMBL_MART_ENSEMBL"): """Get available datasets from mart you've selected""" - if self._marts is None: - self.get_marts() - if mart not in self._marts: + + marts = self.get_marts() + if mart not in marts["Mart"].values: raise ValueError( "Provided mart name (%s) is not valid. see 'names' attribute" % mart )