Skip to content

Commit

Permalink
feature added, ##289, 1-3. Now a cls file will write when call gsea f…
Browse files Browse the repository at this point in the history
…unc; gene names will implicity convert to uppercase to match GMT file (same to enrichr server); add save keyword to func to save GMT file
  • Loading branch information
Zhuoqing Fang authored and Zhuoqing Fang committed Dec 3, 2024
1 parent 1784b6b commit 45c3698
Show file tree
Hide file tree
Showing 9 changed files with 241 additions and 68 deletions.
6 changes: 3 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,17 @@

# General information about the project.
project = "GSEApy"
copyright = "2017-2022, Zhuoqing Fang"
copyright = "2017-2025, Zhuoqing Fang"
author = "Zhuoqing Fang"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = "1.1.4"
version = "1.1.5"
# The full version, including alpha/beta/rc tags.
release = "1.1.4"
release = "1.1.5"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion gseapy/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# or args = argparser.parse_args() will throw bugs!!!


__version__ = "1.1.4"
__version__ = "1.1.5"


def main():
Expand Down
85 changes: 57 additions & 28 deletions gseapy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,63 +13,63 @@
from gseapy.utils import DEFAULT_CACHE_PATH, log_init, mkdirs, retry


class GMT(dict):
class GMT:
def __init__(
self,
mapping: Optional[Dict[str, str]] = None,
description: Optional[str] = None,
source: Optional[str] = None,
):
"""
wrapper of dict. this helps merge multiple dict into one
the original key will changed to new key with suffix '__{description}'
"""
if description is None:
description = ""
self.description = description
_mapping = {}
self.source = source
self._mapping = {}
if mapping is not None:
for key, value in mapping.items():
k = key + "__" + self.description
_mapping[k] = value
super().__init__(_mapping)
self.update(mapping)

def update(self, mapping: Dict[str, str]):
"""
update the mapping in place
"""
for key, value in mapping.items():
k = key + "__" + self.description if self.description else key
self._mapping[k] = value

def apply(self, func):
"""apply function in place"""
for key, value in self.items():
self[key] = func(value)
for key, value in self._mapping.items():
self._mapping[key] = func(value)

def is_empty(self):
return len(self) == 0
return len(self._mapping) == 0

def write(self, ofname: str):
"""
write gmt file to disk
"""
with open(ofname, "w") as out:
for key, value in self.items():
for key, value in self._mapping.items():
collections = key.split("__")
collections += list(value)
out.write("\t".join(collections) + "\n")

def _read(path):
mapping = {}
with open(path, "r") as inp:
for line in inp:
items = line.strip().split("\t")
key = items[0]
if items[1] != "":
key += "__" + items[1]
mapping[key] = items[2:]
return mapping

@classmethod
def read(cls, paths):
def read(cls, paths, source=None):
paths = paths.strip().split(",")
# mapping
mapping = {}
for path in paths:
mapping.update(cls._read(path))
return cls(mapping)
with open(path, "r") as inp:
for line in inp:
items = line.strip().split("\t")
key = items[0]
if items[1] != "":
key += "__" + items[1]
mapping[key] = items[2:]
return cls(mapping, source=source)


class GSEAbase(object):
Expand Down Expand Up @@ -98,7 +98,8 @@ def __init__(
self.pheno_neg = ""
self.permutation_num = 0
self._LIBRARY_LIST_URL = "https://maayanlab.cloud/speedrichr/api/listlibs"

self._gene_isupper = True
self._gene_toupper = False
self._set_cores()
# init logger
self.prepare_outdir()
Expand Down Expand Up @@ -238,6 +239,25 @@ def _check_data(self, exprs: pd.DataFrame) -> pd.DataFrame:
df = df.replace({col: col_min_max for col in df.columns})
return df

def check_uppercase(self, gene_list: List[str]):
"""
Check whether a list of gene names are mostly in uppercase.
Parameters
----------
gene_list : list
A list of gene names
Returns
-------
bool
Whether the list of gene names are mostly in uppercase
"""
is_upper = [s.isupper() for s in gene_list]
if sum(is_upper) / len(is_upper) >= 0.9:
return True
return False

def make_unique(self, rank_metric: pd.DataFrame, col_idx: int) -> pd.DataFrame:
"""
make gene id column unique by adding a digit, similar to R's make.unique
Expand Down Expand Up @@ -309,14 +329,23 @@ def load_gmt(
subsets = list(genesets_dict.keys())
entry1st = genesets_dict[subsets[0]]
gene_dict = {g: i for i, g in enumerate(gene_list)}
if not self._gene_isupper:
gene_dict_upper = {g.upper(): i for i, g in enumerate(gene_list)}
for subset in subsets:
subset_list = set(genesets_dict.get(subset)) # remove duplicates
# drop genes not found in the gene_dict
gene_overlap = [g for g in subset_list if g in gene_dict]
genesets_dict[subset] = gene_overlap
# try uppercase version of gene symbols if overlap is too small
if (not self._gene_isupper) and len(gene_overlap) < self.min_size:
gene_overlap2 = [g for g in subset_list if g in gene_dict_upper]
# set flag to True, means use uppercase version of gene symbols
if len(gene_overlap2) > len(gene_overlap):
gene_overlap = gene_overlap2
self._gene_toupper = True
tag_len = len(gene_overlap)
if (self.min_size <= tag_len <= self.max_size) and tag_len < len(gene_list):
# tag_len should < gene_list
genesets_dict[subset] = gene_overlap
continue
del genesets_dict[subset]

Expand Down
88 changes: 64 additions & 24 deletions gseapy/enrichr.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def __init__(
self._organism = None
self.ENRICHR_URL = "http://maayanlab.cloud"
self.ENRICHR_URL_SPEED = "https://maayanlab.cloud/speedrichr"
self._gene_isupper = True
self._gene_toupper = False
# init logger
self.prepare_outdir()

Expand Down Expand Up @@ -168,17 +170,17 @@ def parse_genelists(self) -> str:
genes = (
self.gene_list.iloc[:, :3]
.apply(lambda x: "\t".join([str(i) for i in x]), axis=1)
.tolist()
.to_list()
)
# input type with weight values
elif self.gene_list.shape[1] == 2:
genes = self.gene_list.apply(
lambda x: ",".join([str(i) for i in x]), axis=1
).tolist()
).to_list()
else:
genes = self.gene_list.squeeze().tolist()
genes = self.gene_list.squeeze().to_list()
elif isinstance(self.gene_list, pd.Series):
genes = self.gene_list.squeeze().tolist()
genes = self.gene_list.squeeze().to_list()
else:
# get gene lists or bed file, or gene list with weighted values.
genes = []
Expand All @@ -189,6 +191,8 @@ def parse_genelists(self) -> str:
if not genes:
raise ValueError("Gene list cannot be empty")

# convert to list and remove whitespace
genes = [g.strip() for g in genes]
self._isezid = all(map(self._is_entrez_id, genes))
if self._isezid:
self._gls = set(map(int, genes))
Expand Down Expand Up @@ -245,6 +249,25 @@ def check_genes(self, gene_list: List[str], usr_list_id: str):
"{} genes successfully recognized by Enrichr".format(returnedN)
)

def check_uppercase(self, gene_list: List[str]):
"""
Check whether a list of gene names are mostly in uppercase.
Parameters
----------
gene_list : list
A list of gene names
Returns
-------
bool
Whether the list of gene names are mostly in uppercase
"""
is_upper = [s.isupper() for s in gene_list]
if sum(is_upper) / len(is_upper) >= 0.9:
return True
return False

def get_results_with_background(
self, gene_list: List[str], background: List[str]
) -> Tuple[AnyStr, pd.DataFrame]:
Expand Down Expand Up @@ -493,8 +516,6 @@ def parse_background(self, gmt: Dict[str, List[str]] = None):
"""
set background genes
"""
if hasattr(self, "_bg") and self._bg:
return self._bg

self._bg = set()
if self.background is None:
Expand All @@ -504,7 +525,7 @@ def parse_background(self, gmt: Dict[str, List[str]] = None):
for term, genes in gmt.items():
bg = bg.union(set(genes))
self._logger.info(
"Background is not set! Use all %s genes in %s."
" Background is not set! Use all %s genes in %s."
% (len(bg), self._gs)
)
self._bg = bg
Expand All @@ -514,7 +535,7 @@ def parse_background(self, gmt: Dict[str, List[str]] = None):
elif isinstance(self.background, str):
# self.background = set(reduce(lambda x,y: x+y, gmt.values(),[]))
self._bg = self.get_background()
self._logger.info("Background: found %s genes" % (len(self._bg)))
self._logger.info(" Background: found %s genes" % (len(self._bg)))
else:
raise Exception("Unsupported background data type")
else:
Expand All @@ -523,7 +544,7 @@ def parse_background(self, gmt: Dict[str, List[str]] = None):
it = iter(self.background)
self._bg = set(self.background)
except TypeError:
self._logger.error("Unsupported background data type")
self._logger.error(" Unsupported background data type")

return self._bg

Expand All @@ -541,9 +562,29 @@ def enrich(self, gmt: Dict[str, List[str]]):
Term Overlap P-value Odds Ratio Combinde Score Adjusted_P-value Genes
"""
# check gene cases
# self._gene_isupper = self.check_uppercase(self._gls)
top10 = list(gmt.keys())[:10]
_gls = self._gls
ups = []
_gene_toupper = False
for key in top10:
ups.append(self.check_uppercase(gmt[key]))
if all(ups) and (not self._gene_isupper):
_gls = [x.upper() for x in self._gls]
_gene_toupper = True # set flag
self._logger.info(
" Genes in GMT file are all in upper case, convert query to upper case."
)

bg = self.parse_background(gmt)
self._logger.info(_gls[1:5])
self._logger.info(list(bg)[1:5])
if isinstance(bg, set) and (not self._gene_isupper) and _gene_toupper:
bg = {s.upper() for s in bg} # convert bg to upper case, too
# self._logger.info(" Background is converted to upper case: %s" % list(bg)[1:5])
# statistical testing
hgtest = list(calc_pvalues(query=self._gls, gene_sets=gmt, background=bg))
hgtest = list(calc_pvalues(query=_gls, gene_sets=gmt, background=bg))
if len(hgtest) > 0:
terms, pvals, oddr, olsz, gsetsz, genes = hgtest
fdrs, rej = multiple_testing_correction(
Expand All @@ -567,7 +608,8 @@ def run(self):
"""run enrichr for one sample gene list but multi-libraries"""

# read input file
genes_list = self.parse_genelists()
genes_list = self.parse_genelists() #
self._gene_isupper = self.check_uppercase(self._gls)

# self._logger.info("Connecting to Enrichr Server to get latest library names")
gss = self.parse_genesets()
Expand All @@ -586,39 +628,34 @@ def run(self):

for name, g in zip(self._gs_name, gss):
self._logger.info("Run: %s " % name)
self._gs = name
if isinstance(g, dict):
## local mode
shortID, self._gs = str(id(g)), name
## local mode, need to check gene cases
self._logger.debug(
"Off-line enrichment analysis with library: %s" % (self._gs)
"Off-line enrichment analysis with library: %s" % (name)
)
if self._isezid:
g = {k: list(map(int, v)) for k, v in g.items()}
res = self.enrich(g)
if res is None:
self._logger.info(
"No hits return, for gene set: Custom%s" % shortID
)
self._logger.info("No hits returned for library: %s" % name)
continue
else:
## online mode
self._gs = name
## online mode, no need to check gene cases
self._logger.debug("Enrichr service using library: %s" % (name))
# self._logger.info("Enrichr Library: %s"% self._gs)
bg = self.parse_background()
# whether user input background
if isinstance(bg, set) and len(bg) > 0:
shortID, res = self.get_results_with_background(
genes_list, self._bg
)
shortID, res = self.get_results_with_background(genes_list, bg)
else:
shortID, res = self.get_results(genes_list)

# Remember gene set library used
res.insert(0, "Gene_set", name)
# Append to master dataframe
# Append to results
self.results.append(res)
self.res2d = res
self.res2d = res # save last result
if self._outdir is None:
continue
outfile = "%s/%s.%s.%s.reports.txt" % (
Expand All @@ -643,6 +680,9 @@ def run(self):
ofname=outfile.replace("txt", self.format),
)
self._logger.debug("Generate figures")
if len(self.results) == 0:
self._logger.error("No hits returned for all input gene sets!")
return
self.results = pd.concat(self.results, ignore_index=True)
self._logger.info("Done.")

Expand Down
Loading

0 comments on commit 45c3698

Please sign in to comment.