From 45c3698cf2cb2ec84c6f4be17a1c4a4d81b6f7ce Mon Sep 17 00:00:00 2001 From: Zhuoqing Fang Date: Tue, 3 Dec 2024 15:46:01 -0800 Subject: [PATCH] feature added, ##289, 1-3. Now a cls file will write when call gsea func; gene names will implicity convert to uppercase to match GMT file (same to enrichr server); add save keyword to func to save GMT file --- docs/conf.py | 6 ++-- gseapy/__main__.py | 2 +- gseapy/base.py | 85 +++++++++++++++++++++++++++++--------------- gseapy/enrichr.py | 88 +++++++++++++++++++++++++++++++++------------- gseapy/gsea.py | 82 +++++++++++++++++++++++++++++++++++++++--- gseapy/gsva.py | 7 +++- gseapy/msigdb.py | 7 ++-- gseapy/parser.py | 23 ++++++++++-- gseapy/ssgsea.py | 9 ++++- 9 files changed, 241 insertions(+), 68 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 73e6c47..4097027 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,7 +57,7 @@ # General information about the project. project = "GSEApy" -copyright = "2017-2022, Zhuoqing Fang" +copyright = "2017-2025, Zhuoqing Fang" author = "Zhuoqing Fang" # The version info for the project you're documenting, acts as replacement for @@ -65,9 +65,9 @@ # built documents. # # The short X.Y version. -version = "1.1.4" +version = "1.1.5" # The full version, including alpha/beta/rc tags. -release = "1.1.4" +release = "1.1.5" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/gseapy/__main__.py b/gseapy/__main__.py index 3e2aac3..1821187 100644 --- a/gseapy/__main__.py +++ b/gseapy/__main__.py @@ -11,7 +11,7 @@ # or args = argparser.parse_args() will throw bugs!!! -__version__ = "1.1.4" +__version__ = "1.1.5" def main(): diff --git a/gseapy/base.py b/gseapy/base.py index e44d82f..162a33d 100644 --- a/gseapy/base.py +++ b/gseapy/base.py @@ -13,63 +13,63 @@ from gseapy.utils import DEFAULT_CACHE_PATH, log_init, mkdirs, retry -class GMT(dict): +class GMT: def __init__( self, mapping: Optional[Dict[str, str]] = None, description: Optional[str] = None, + source: Optional[str] = None, ): """ wrapper of dict. this helps merge multiple dict into one the original key will changed to new key with suffix '__{description}' """ - if description is None: - description = "" self.description = description - _mapping = {} + self.source = source + self._mapping = {} if mapping is not None: - for key, value in mapping.items(): - k = key + "__" + self.description - _mapping[k] = value - super().__init__(_mapping) + self.update(mapping) + + def update(self, mapping: Dict[str, str]): + """ + update the mapping in place + """ + for key, value in mapping.items(): + k = key + "__" + self.description if self.description else key + self._mapping[k] = value def apply(self, func): """apply function in place""" - for key, value in self.items(): - self[key] = func(value) + for key, value in self._mapping.items(): + self._mapping[key] = func(value) def is_empty(self): - return len(self) == 0 + return len(self._mapping) == 0 def write(self, ofname: str): """ write gmt file to disk """ with open(ofname, "w") as out: - for key, value in self.items(): + for key, value in self._mapping.items(): collections = key.split("__") collections += list(value) out.write("\t".join(collections) + "\n") - def _read(path): - mapping = {} - with open(path, "r") as inp: - for line in inp: - items = line.strip().split("\t") - key = items[0] - if items[1] != "": - key += "__" + items[1] - mapping[key] = items[2:] - return mapping - @classmethod - def read(cls, paths): + def read(cls, paths, source=None): paths = paths.strip().split(",") # mapping mapping = {} for path in paths: - mapping.update(cls._read(path)) - return cls(mapping) + with open(path, "r") as inp: + for line in inp: + items = line.strip().split("\t") + key = items[0] + if items[1] != "": + key += "__" + items[1] + mapping[key] = items[2:] + return cls(mapping, source=source) class GSEAbase(object): @@ -98,7 +98,8 @@ def __init__( self.pheno_neg = "" self.permutation_num = 0 self._LIBRARY_LIST_URL = "https://maayanlab.cloud/speedrichr/api/listlibs" - + self._gene_isupper = True + self._gene_toupper = False self._set_cores() # init logger self.prepare_outdir() @@ -238,6 +239,25 @@ def _check_data(self, exprs: pd.DataFrame) -> pd.DataFrame: df = df.replace({col: col_min_max for col in df.columns}) return df + def check_uppercase(self, gene_list: List[str]): + """ + Check whether a list of gene names are mostly in uppercase. + + Parameters + ---------- + gene_list : list + A list of gene names + + Returns + ------- + bool + Whether the list of gene names are mostly in uppercase + """ + is_upper = [s.isupper() for s in gene_list] + if sum(is_upper) / len(is_upper) >= 0.9: + return True + return False + def make_unique(self, rank_metric: pd.DataFrame, col_idx: int) -> pd.DataFrame: """ make gene id column unique by adding a digit, similar to R's make.unique @@ -309,14 +329,23 @@ def load_gmt( subsets = list(genesets_dict.keys()) entry1st = genesets_dict[subsets[0]] gene_dict = {g: i for i, g in enumerate(gene_list)} + if not self._gene_isupper: + gene_dict_upper = {g.upper(): i for i, g in enumerate(gene_list)} for subset in subsets: subset_list = set(genesets_dict.get(subset)) # remove duplicates # drop genes not found in the gene_dict gene_overlap = [g for g in subset_list if g in gene_dict] - genesets_dict[subset] = gene_overlap + # try uppercase version of gene symbols if overlap is too small + if (not self._gene_isupper) and len(gene_overlap) < self.min_size: + gene_overlap2 = [g for g in subset_list if g in gene_dict_upper] + # set flag to True, means use uppercase version of gene symbols + if len(gene_overlap2) > len(gene_overlap): + gene_overlap = gene_overlap2 + self._gene_toupper = True tag_len = len(gene_overlap) if (self.min_size <= tag_len <= self.max_size) and tag_len < len(gene_list): # tag_len should < gene_list + genesets_dict[subset] = gene_overlap continue del genesets_dict[subset] diff --git a/gseapy/enrichr.py b/gseapy/enrichr.py index 01b892f..7b29574 100644 --- a/gseapy/enrichr.py +++ b/gseapy/enrichr.py @@ -54,6 +54,8 @@ def __init__( self._organism = None self.ENRICHR_URL = "http://maayanlab.cloud" self.ENRICHR_URL_SPEED = "https://maayanlab.cloud/speedrichr" + self._gene_isupper = True + self._gene_toupper = False # init logger self.prepare_outdir() @@ -168,17 +170,17 @@ def parse_genelists(self) -> str: genes = ( self.gene_list.iloc[:, :3] .apply(lambda x: "\t".join([str(i) for i in x]), axis=1) - .tolist() + .to_list() ) # input type with weight values elif self.gene_list.shape[1] == 2: genes = self.gene_list.apply( lambda x: ",".join([str(i) for i in x]), axis=1 - ).tolist() + ).to_list() else: - genes = self.gene_list.squeeze().tolist() + genes = self.gene_list.squeeze().to_list() elif isinstance(self.gene_list, pd.Series): - genes = self.gene_list.squeeze().tolist() + genes = self.gene_list.squeeze().to_list() else: # get gene lists or bed file, or gene list with weighted values. genes = [] @@ -189,6 +191,8 @@ def parse_genelists(self) -> str: if not genes: raise ValueError("Gene list cannot be empty") + # convert to list and remove whitespace + genes = [g.strip() for g in genes] self._isezid = all(map(self._is_entrez_id, genes)) if self._isezid: self._gls = set(map(int, genes)) @@ -245,6 +249,25 @@ def check_genes(self, gene_list: List[str], usr_list_id: str): "{} genes successfully recognized by Enrichr".format(returnedN) ) + def check_uppercase(self, gene_list: List[str]): + """ + Check whether a list of gene names are mostly in uppercase. + + Parameters + ---------- + gene_list : list + A list of gene names + + Returns + ------- + bool + Whether the list of gene names are mostly in uppercase + """ + is_upper = [s.isupper() for s in gene_list] + if sum(is_upper) / len(is_upper) >= 0.9: + return True + return False + def get_results_with_background( self, gene_list: List[str], background: List[str] ) -> Tuple[AnyStr, pd.DataFrame]: @@ -493,8 +516,6 @@ def parse_background(self, gmt: Dict[str, List[str]] = None): """ set background genes """ - if hasattr(self, "_bg") and self._bg: - return self._bg self._bg = set() if self.background is None: @@ -504,7 +525,7 @@ def parse_background(self, gmt: Dict[str, List[str]] = None): for term, genes in gmt.items(): bg = bg.union(set(genes)) self._logger.info( - "Background is not set! Use all %s genes in %s." + " Background is not set! Use all %s genes in %s." % (len(bg), self._gs) ) self._bg = bg @@ -514,7 +535,7 @@ def parse_background(self, gmt: Dict[str, List[str]] = None): elif isinstance(self.background, str): # self.background = set(reduce(lambda x,y: x+y, gmt.values(),[])) self._bg = self.get_background() - self._logger.info("Background: found %s genes" % (len(self._bg))) + self._logger.info(" Background: found %s genes" % (len(self._bg))) else: raise Exception("Unsupported background data type") else: @@ -523,7 +544,7 @@ def parse_background(self, gmt: Dict[str, List[str]] = None): it = iter(self.background) self._bg = set(self.background) except TypeError: - self._logger.error("Unsupported background data type") + self._logger.error(" Unsupported background data type") return self._bg @@ -541,9 +562,29 @@ def enrich(self, gmt: Dict[str, List[str]]): Term Overlap P-value Odds Ratio Combinde Score Adjusted_P-value Genes """ + # check gene cases + # self._gene_isupper = self.check_uppercase(self._gls) + top10 = list(gmt.keys())[:10] + _gls = self._gls + ups = [] + _gene_toupper = False + for key in top10: + ups.append(self.check_uppercase(gmt[key])) + if all(ups) and (not self._gene_isupper): + _gls = [x.upper() for x in self._gls] + _gene_toupper = True # set flag + self._logger.info( + " Genes in GMT file are all in upper case, convert query to upper case." + ) + bg = self.parse_background(gmt) + self._logger.info(_gls[1:5]) + self._logger.info(list(bg)[1:5]) + if isinstance(bg, set) and (not self._gene_isupper) and _gene_toupper: + bg = {s.upper() for s in bg} # convert bg to upper case, too + # self._logger.info(" Background is converted to upper case: %s" % list(bg)[1:5]) # statistical testing - hgtest = list(calc_pvalues(query=self._gls, gene_sets=gmt, background=bg)) + hgtest = list(calc_pvalues(query=_gls, gene_sets=gmt, background=bg)) if len(hgtest) > 0: terms, pvals, oddr, olsz, gsetsz, genes = hgtest fdrs, rej = multiple_testing_correction( @@ -567,7 +608,8 @@ def run(self): """run enrichr for one sample gene list but multi-libraries""" # read input file - genes_list = self.parse_genelists() + genes_list = self.parse_genelists() # + self._gene_isupper = self.check_uppercase(self._gls) # self._logger.info("Connecting to Enrichr Server to get latest library names") gss = self.parse_genesets() @@ -586,39 +628,34 @@ def run(self): for name, g in zip(self._gs_name, gss): self._logger.info("Run: %s " % name) + self._gs = name if isinstance(g, dict): - ## local mode - shortID, self._gs = str(id(g)), name + ## local mode, need to check gene cases self._logger.debug( - "Off-line enrichment analysis with library: %s" % (self._gs) + "Off-line enrichment analysis with library: %s" % (name) ) if self._isezid: g = {k: list(map(int, v)) for k, v in g.items()} res = self.enrich(g) if res is None: - self._logger.info( - "No hits return, for gene set: Custom%s" % shortID - ) + self._logger.info("No hits returned for library: %s" % name) continue else: - ## online mode - self._gs = name + ## online mode, no need to check gene cases self._logger.debug("Enrichr service using library: %s" % (name)) # self._logger.info("Enrichr Library: %s"% self._gs) bg = self.parse_background() # whether user input background if isinstance(bg, set) and len(bg) > 0: - shortID, res = self.get_results_with_background( - genes_list, self._bg - ) + shortID, res = self.get_results_with_background(genes_list, bg) else: shortID, res = self.get_results(genes_list) # Remember gene set library used res.insert(0, "Gene_set", name) - # Append to master dataframe + # Append to results self.results.append(res) - self.res2d = res + self.res2d = res # save last result if self._outdir is None: continue outfile = "%s/%s.%s.%s.reports.txt" % ( @@ -643,6 +680,9 @@ def run(self): ofname=outfile.replace("txt", self.format), ) self._logger.debug("Generate figures") + if len(self.results) == 0: + self._logger.error("No hits returned for all input gene sets!") + return self.results = pd.concat(self.results, ignore_index=True) self._logger.info("Done.") diff --git a/gseapy/gsea.py b/gseapy/gsea.py index 3105702..1e450a7 100644 --- a/gseapy/gsea.py +++ b/gseapy/gsea.py @@ -43,6 +43,59 @@ def __init__( seed: int = 123, verbose: bool = False, ): + """ + GSEA main tool. + + Parameters + ---------- + data : pandas.DataFrame or str + Data table with genes in the index and samples in the columns. + If str, the file path to the data file (e.g. .txt, .csv, .tsv). + gene_sets : list or str or dict + Gene sets file path or list of gene sets. + If str, the file path to the gene sets file (e.g. .gmt, .txt, .csv). + If dict, a dictionary of gene sets. + classes : list or str or dict + Phenotype labels file path or list of phenotype labels. + If str, the file path to the phenotype labels file (e.g. .txt, .csv, .tsv). + If dict, a dictionary of phenotype labels. + outdir : str, optional + Output directory. If not provided, the output will be saved. + min_size : int, optional + Minimum number of genes in a gene set. Default is 15. + max_size : int, optional + Maximum number of genes in a gene set. Default is 500. + permutation_num : int, optional + Number of permutations. Default is 1000. + weight : float, optional + Weighting factor used in the calculation of the ES. Default is 1.0. + permutation_type : str, optional + Type of permutation, either 'phenotype' or 'gene_set'. Default is 'phenotype'. + method : str, optional + Metric used in the calculation of the ES. Default is 'signal_to_noise'. + ascending : bool, optional + If True, the genes will be ranked in ascending order. Default is False. + threads : int, optional + Number of threads to use. Default is 1. + figsize : tuple, optional + Figure size for the enrichment plot. Default is (6.5, 6). + format : str, optional + Format of the output file. Default is 'pdf'. + graph_num : int, optional + Number of enrichment plots to generate. Default is 20. + no_plot : bool, optional + If True, no enrichment plots will be generated. Default is False. + seed : int, optional + Random seed for the permutation test. Default is 123. + verbose : bool, optional + If True, the progress bar will be shown. Default is False. + + Examples + -------- + >>> from gseapy import GSEA + >>> gsea = GSEA(data='expression.txt', gene_sets='gene_sets.gmt', classes='test.cls') + >>> gsea.run() + """ super(GSEA, self).__init__( outdir=outdir, gene_sets=gene_sets, @@ -255,6 +308,16 @@ def load_classes(self, classes: Union[str, List[str], Dict[str, Any]]): self.pheno_neg = neg self.groups = cls_vector + if self._outdir is not None: + self.to_cls(outdir=self.outdir) + + def to_cls(self, outdir: str): + """Save group information to cls file""" + with open(os.path.join(outdir, "group.cls"), "w") as f: + f.write(f"{len(self.groups)} 2 1\n") + f.write(f"# {self.pheno_pos} {self.pheno_neg}\n") + f.write(" ".join(self.groups) + "\n") + # @profile def run(self): """GSEA main procedure""" @@ -279,13 +342,19 @@ def run(self): # select correct expression genes and values. dat, cls_dict = self.load_data() # filtering out gene sets and build gene sets dictionary + self._gene_isupper = self.check_uppercase(gene_list=dat.index.values) gmt = self.load_gmt(gene_list=dat.index.values, gmt=self.gene_sets) self.gmt = gmt self._logger.info( "%04d gene_sets used for further statistical testing....." % len(gmt) ) self._logger.info("Start to run GSEA...Might take a while..................") - # cpu numbers + # gene symbols + gene_names = dat.index.to_list() + if (not self._gene_isupper) and self._gene_toupper: + gene_names = [x.upper() for x in gene_names] + self._logger.info("Genes are converted to uppercase.") + # compute ES, NES, pval, FDR, RES if self.permutation_type == "gene_set": # ranking metrics calculation. @@ -298,7 +367,7 @@ def run(self): ascending=self.ascending, ) gsum = prerank_rs( - dat2.index.values.tolist(), # gene list + gene_names, # gene list dat2.squeeze().values.tolist(), # ranking values gmt, # must be a dict object self.weight, @@ -318,7 +387,7 @@ def run(self): map(lambda x: True if x == self.pheno_pos else False, self.groups) ) gsum = gsea_rs( - dat.index.values.tolist(), + gene_names, dat.values.tolist(), # each row is gene values across samples gmt, group, @@ -478,12 +547,17 @@ def run(self): # Start Analysis self._logger.info("Parsing data files for GSEA.............................") # filtering out gene sets and build gene sets dictionary + self._gene_isupper = self.check_uppercase(gene_list=dat2.index.values) gmt = self.load_gmt(gene_list=dat2.index.values, gmt=self.gene_sets) self.gmt = gmt self._logger.info( "%04d gene_sets used for further statistical testing....." % len(gmt) ) self._logger.info("Start to run GSEA...Might take a while..................") + gene_names = dat2.index.to_list() + if (not self._gene_isupper) and self._gene_toupper: + gene_names = [x.upper() for x in gene_names] + self._logger.info("Genes are converted to uppercase.") # compute ES, NES, pval, FDR, RES if isinstance(dat2, pd.DataFrame): _prerank = prerank2d_rs @@ -491,7 +565,7 @@ def run(self): _prerank = prerank_rs # run gsum = _prerank( - dat2.index.values.tolist(), # gene list + gene_names, # gene list dat2.values.tolist(), # ranking values gmt, # must be a dict object self.weight, diff --git a/gseapy/gsva.py b/gseapy/gsva.py index bc0e396..8f6be2c 100644 --- a/gseapy/gsva.py +++ b/gseapy/gsva.py @@ -95,6 +95,7 @@ def run(self): self.data = df # normalized samples, and rank # filtering out gene sets and build gene sets dictionary + self._gene_isupper = self.check_uppercase(gene_list=df.index.values) gmt = self.load_gmt(gene_list=df.index.values, gmt=self.gene_sets) self.gmt = gmt self._logger.info( @@ -102,9 +103,13 @@ def run(self): ) # start analysis self._logger.info("Start to run GSVA...Might take a while................") + gene_names = df.index.to_list() + if (not self._gene_isupper) and self._gene_toupper: + gene_names = [x.upper() for x in gene_names] + self._logger.info("Genes are converted to uppercase.") # run gsum = gsva_rs( - df.index.values.tolist(), + gene_names, df.values.tolist(), gmt, self.kernel, diff --git a/gseapy/msigdb.py b/gseapy/msigdb.py index 0af6781..1033703 100644 --- a/gseapy/msigdb.py +++ b/gseapy/msigdb.py @@ -1,8 +1,7 @@ import re - import pandas as pd import requests - +from io import StringIO class Msigdb: def __init__(self, dbver: str = "2023.1.Hs"): @@ -26,7 +25,7 @@ def _get_db_version(self): """ resp = requests.get(self.url) if resp.ok: - d = pd.read_html(resp.text)[0] + d = pd.read_html(StringIO(resp.text))[0] # remove item : parent dictory and NA columns d = d.dropna(how="all").iloc[1:, 1:3].reset_index(drop=True) d.iloc[:, 0] = d.iloc[:, 0].str.rstrip("/") @@ -92,7 +91,7 @@ def list_gmt(self, db: str): url = self.url + db resp = requests.get(url) if resp.ok: - d = pd.read_html(resp.text)[0] + d = pd.read_html(StringIO(resp.text))[0] # remove item : parent dictory and NA columns d = d.dropna(how="all").iloc[1:, 1:4] d = d[d.iloc[:, 0].str.match(self._pattern)] diff --git a/gseapy/parser.py b/gseapy/parser.py index f2b4102..f1d1153 100644 --- a/gseapy/parser.py +++ b/gseapy/parser.py @@ -94,6 +94,7 @@ def get_library( organism: str = "Human", min_size: int = 0, max_size: int = 2000, + save: Optional[str] = None, gene_list: Optional[List[str]] = None, ) -> Dict[str, List[str]]: """Parse gene_sets.gmt(gene set database) file or download from enrichr server. @@ -107,6 +108,8 @@ def get_library( :param min_size: Minimum allowed number of genes for each gene set. Default: 0. :param max_size: Maximum allowed number of genes for each gene set. Default: 2000. + :param str save: the path to save the filtered gene set database. + :param gene_list: if input a gene list, min and max overlapped genes between gene set and gene_list are kept. :return dict: Return a filtered gene set database dictionary. @@ -170,6 +173,13 @@ def get_library( + "Note: Gene names for gseapy is case sensitive." ) + if save is not None: + gmtout = open(save, "w") + for k, v in genesets_dict.items(): + outline = "%s\t%s\t%s\n" % (k, name, "\t".join(v)) + gmtout.write(outline) + gmtout.close() + return genesets_dict @@ -233,11 +243,14 @@ def get_library_name(organism: str = "Human") -> List[str]: return sorted(libs) -def download_library(name: str, organism: str = "human") -> Dict[str, List[str]]: +def download_library( + name: str, organism: str = "human", filename: str = None +) -> Dict[str, List[str]]: """download enrichr libraries. :param str name: the enrichr library name. see `gseapy.get_library_name()`. :param str organism: Select one from { 'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm' } + :param str filename: the file name to save if not None. :return dict: gene_sets of the enrichr library from selected organism @@ -299,7 +312,6 @@ def download_library(name: str, organism: str = "human") -> Dict[str, List[str]] ) # reformat to dict genesets_dict = {} - # outname = os.path.join(DEFAULT_CACHE_PATH, "enrichr.%s.gmt" % libname) for line in response.iter_lines(chunk_size=1024, decode_unicode="utf-8"): line = line.strip().split("\t") k = line[0] @@ -307,4 +319,11 @@ def download_library(name: str, organism: str = "human") -> Dict[str, List[str]] v = list(filter(lambda x: True if len(x) else False, v)) genesets_dict[k] = v + if filename is not None: + gmtout = open(filename, "w") + for k, v in genesets_dict.items(): + outline = "%s\t%s\t%s\n" % (k, name, "\t".join(v)) + gmtout.write(outline) + gmtout.close() + return genesets_dict diff --git a/gseapy/ssgsea.py b/gseapy/ssgsea.py index 25b8716..36ad553 100644 --- a/gseapy/ssgsea.py +++ b/gseapy/ssgsea.py @@ -127,6 +127,7 @@ def run(self): normdat = self.norm_samples(data) self.ranking = normdat # filtering out gene sets and build gene sets dictionary + self._gene_isupper = self.check_uppercase(gene_list=normdat.index.values) gmt = self.load_gmt(gene_list=normdat.index.values, gmt=self.gene_sets) self.gmt = gmt self._logger.info( @@ -149,8 +150,14 @@ def runSamplesPermu( assert self.min_size <= self.max_size if self._outdir: mkdirs(self.outdir) + + gene_names = df.index.to_list() + if (not self._gene_isupper) and self._gene_toupper: + gene_names = [x.upper() for x in gene_names] + self._logger.info("Genes are converted to uppercase.") + gsum = ssgsea_rs( - df.index.values.tolist(), + gene_names, df.values.tolist(), gmt, self.weight,