From 6bca1f0b8a84f812cefa81a598a0cd30876b2a45 Mon Sep 17 00:00:00 2001 From: zqfang Date: Mon, 23 Oct 2023 16:38:27 -0700 Subject: [PATCH] refactor --- gseapy/__init__.py | 17 ++-- gseapy/__main__.py | 2 +- gseapy/gsea.py | 200 ------------------------------------------ gseapy/ssgsea.py | 214 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 209 deletions(-) create mode 100644 gseapy/ssgsea.py diff --git a/gseapy/__init__.py b/gseapy/__init__.py index da7409d..d8b8df8 100644 --- a/gseapy/__init__.py +++ b/gseapy/__init__.py @@ -6,11 +6,12 @@ from .__main__ import __version__ from .biomart import Biomart from .enrichr import Enrichr -from .gsea import GSEA, Prerank, Replot, SingleSampleGSEA +from .gsea import GSEA, Prerank, Replot from .gsva import GSVA from .msigdb import Msigdb from .parser import get_library, get_library_name, read_gmt from .plot import barplot, dotplot, enrichment_map, gseaplot, gseaplot2, heatmap +from .ssgsea import SingleSampleGSEA def gsea( @@ -162,8 +163,8 @@ def ssgsea( data: Union[pd.Series, pd.DataFrame, str], gene_sets: Union[List[str], str, Dict[str, str]], outdir: Optional[str] = None, - sample_norm_method: str = "rank", - correl_norm_type: str = "rank", + sample_norm_method: Optional[str] = "rank", + correl_norm_type: Optional[str] = "rank", min_size: int = 15, max_size: int = 500, permutation_num: Optional[int] = None, @@ -187,20 +188,20 @@ def ssgsea( :param outdir: Results output directory. If None, nothing will write to disk. - :param str sample_norm_method: Sample normalization method. Choose from {'rank', 'log', 'log_rank'}. Default: rank. + :param str sample_norm_method: Sample normalization method. Choose from {'rank', 'log', 'log_rank', None}. Default: rank. this argument will be used for ordering genes. 1. 'rank': Rank your expression data, and transform by 10000*rank_dat/gene_numbers 2. 'log' : Do not rank, but transform data by log(data + exp(1)), while data = data[data<1] =1. 3. 'log_rank': Rank your expression data, and transform by log(10000*rank_dat/gene_numbers+ exp(1)) - 4. 'custom': Do nothing, and use your own rank value to calculate enrichment score. + 4. None or 'custom': Do nothing, and use your own rank value to calculate enrichment score. see here: https://github.com/GSEA-MSigDB/ssGSEAProjection-gpmodule/blob/master/src/ssGSEAProjection.Library.R, line 86 - :param str correl_norm_type: correlation normalization type. Choose from {'rank', 'symrank', 'zscore'}. Default: rank. + :param str correl_norm_type: correlation normalization type. Choose from {'rank', 'symrank', 'zscore', None}. Default: rank. After ordering genes by sample_norm_method, further data transformed could be applied to get enrichment score. - when weight==0, sample_norm_method and correl_norm_type do not matter; + when weight == 0, sample_norm_method and correl_norm_type do not matter; when weight > 0, the combination of sample_norm_method and correl_norm_type dictate how the gene expression values in input data are transformed to obtain the score -- use this setting with care (the transformations @@ -209,7 +210,7 @@ def ssgsea( sample_norm_method will first transformed and rank original data. the data is named correl_vector for each sample. then correl_vector is transformed again by - 1. correl_norm_type =='rank': do nothing, genes are weighted by actual correl_vector. + 1. correl_norm_type is None or 'rank' : do nothing, genes are weighted by actual correl_vector. 2. correl_norm_type =='symrank': symmetric ranking. 3. correl_norm_type =='zscore': standardizes the correl_vector before using them to calculate scores. diff --git a/gseapy/__main__.py b/gseapy/__main__.py index 33160c7..9a9d68f 100644 --- a/gseapy/__main__.py +++ b/gseapy/__main__.py @@ -85,7 +85,7 @@ def main(): pre.run() elif subcommand == "ssgsea": - from .gsea import SingleSampleGSEA + from .ssgsea import SingleSampleGSEA ss = SingleSampleGSEA( data=args.data, diff --git a/gseapy/gsea.py b/gseapy/gsea.py index 7da5eba..bf0cf7c 100644 --- a/gseapy/gsea.py +++ b/gseapy/gsea.py @@ -13,13 +13,10 @@ from gseapy.base import GSEAbase from gseapy.gse import ( - CorrelType, Metric, gsea_rs, prerank2d_rs, prerank_rs, - ssgsea_rs, - gsva_rs, ) from gseapy.parser import gsea_cls_parser from gseapy.plot import gseaplot @@ -481,203 +478,6 @@ def run(self): return -class SingleSampleGSEA(GSEAbase): - """GSEA extension: single sample GSEA""" - - def __init__( - self, - data: Union[pd.DataFrame, pd.Series, str], - gene_sets: Union[List[str], str, Dict[str, str]], - outdir: Optional[str] = None, - sample_norm_method: str = "rank", - correl_norm_type: str = "zscore", - min_size: int = 15, - max_size: int = 500, - permutation_num: Optional[int] = None, - weight: float = 0.25, - ascending: bool = False, - threads: int = 1, - figsize: Tuple[float, float] = (6.5, 6), - format: str = "pdf", - graph_num: int = 20, - no_plot: bool = True, - seed: int = 123, - verbose: bool = False, - **kwargs, - ): - super(SingleSampleGSEA, self).__init__( - outdir=outdir, - gene_sets=gene_sets, - module="ssgsea", - threads=threads, - verbose=verbose, - ) - self.data = data - self.sample_norm_method = sample_norm_method - self.correl_type = self.norm_correl(correl_norm_type) - self.weight = weight - self.min_size = min_size - self.max_size = max_size - self.permutation_num = int(permutation_num) if permutation_num else None - self.ascending = ascending - self.figsize = figsize - self.format = format - self.graph_num = int(graph_num) - self.seed = seed - self.ranking = None - self._noplot = no_plot - self.permutation_type = "gene_set" - - def corplot(self): - """NES Correlation plot - TODO - """ - - def setplot(self): - """ranked genes' location plot - TODO - """ - - def load_data(self) -> pd.DataFrame: - # load data - exprs = self.data - if isinstance(exprs, pd.DataFrame): - rank_metric = exprs.copy() - # handle dataframe with gene_name as index. - self._logger.debug("Input data is a DataFrame with gene names") - # handle index is not gene_names - if rank_metric.index.dtype != "O": - rank_metric.set_index(keys=rank_metric.columns[0], inplace=True) - if rank_metric.columns.dtype != "O": - rank_metric.columns = rank_metric.columns.astype(str) - - rank_metric = rank_metric.select_dtypes(include=[np.number]) - elif isinstance(exprs, pd.Series): - # change to DataFrame - self._logger.debug("Input data is a Series with gene names") - if exprs.name is None: - # rename col if name attr is none - exprs.name = "sample1" - elif exprs.name.dtype != "O": - exprs.name = exprs.name.astype(str) - rank_metric = exprs.to_frame() - elif os.path.isfile(exprs): - # GCT input format? - if exprs.endswith("gct"): - rank_metric = pd.read_csv( - exprs, skiprows=1, comment="#", index_col=0, sep="\t" - ) - else: - # just txt file like input - sep = "\t" - if exprs.endswith("csv"): - sep = "," - rank_metric = pd.read_csv(exprs, comment="#", index_col=0, sep=sep) - if rank_metric.shape[1] == 1: - # rnk file like input - rank_metric.columns = rank_metric.columns.astype(str) - # select numbers - rank_metric = rank_metric.select_dtypes(include=[np.number]) - else: - raise Exception("Error parsing gene ranking values!") - - if rank_metric.isnull().any().sum() > 0: - self._logger.warning("Input data contains NA, filled NA with 0") - rank_metric = rank_metric.fillna(0) - - if rank_metric.index.duplicated().sum() > 0: - self._logger.warning( - "Found duplicated gene names, values averaged by gene names!" - ) - rank_metric = rank_metric.groupby(level=0).mean() - return rank_metric - - def norm_samples(self, dat: pd.DataFrame) -> pd.DataFrame: - """normalization samples - see here: https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L237 - """ - - if self.sample_norm_method == "rank": - data = dat.rank(axis=0, method="average", na_option="bottom") - data = 10000 * data / data.shape[0] - elif self.sample_norm_method == "log_rank": - data = dat.rank(axis=0, method="average", na_option="bottom") - data = np.log(10000 * data / data.shape[0] + np.exp(1)) - elif self.sample_norm_method == "log": - dat[dat < 1] = 1 - data = np.log(dat + np.exp(1)) - elif self.sample_norm_method == "custom": - self._logger.info("Use custom rank metric for ssGSEA") - data = dat - else: - raise Exception("No supported method: %s" % self.sample_norm_method) - - return data - - def norm_correl(self, cortype): - """ - After norm_samples, input value will be further nomizalized before using them to calculate scores. - see source code: - https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L396 - """ - if cortype == "zscore": - return CorrelType.ZScore - elif cortype == "rank": - return CorrelType.Rank - elif cortype == "symrank": - return CorrelType.SymRank - else: - raise Exception("unsupported correl type input") - - def run(self): - """run entry""" - if self.permutation_num is None: - self.permutation_num = 0 - self._logger.info("Parsing data files for ssGSEA...........................") - # load data - data = self.load_data() - # normalized samples, and rank - normdat = self.norm_samples(data) - self.ranking = normdat - # filtering out gene sets and build gene sets dictionary - gmt = self.load_gmt(gene_list=normdat.index.values, gmt=self.gene_sets) - self.gmt = gmt - self._logger.info( - "%04d gene_sets used for further statistical testing....." % len(gmt) - ) - # start analysis - self._logger.info("Start to run ssGSEA...Might take a while................") - if self.permutation_num > 0: - # run permutation procedure and calculate pvals, fdrs - self._logger.warning( - "Run ssGSEA with permutation procedure, don't use the pval, fdr results for publication." - ) - self.runSamplesPermu(df=normdat, gmt=gmt) - - def runSamplesPermu( - self, df: pd.DataFrame, gmt: Optional[Dict[str, List[str]]] = None - ): - """Single Sample GSEA workflow with permutation procedure""" - - assert self.min_size <= self.max_size - if self._outdir: - mkdirs(self.outdir) - gsum = ssgsea_rs( - df.index.values.tolist(), - df.values.tolist(), - gmt, - self.weight, - self.min_size, - self.max_size, - self.permutation_num, # permutate just like runing prerank analysis - self.correl_type, - self._threads, - self.seed, - ) - self.to_df(gsum.summaries, gmt, df, gsum.indices) - return - - class Replot(GSEAbase): """To reproduce GSEA desktop output results.""" diff --git a/gseapy/ssgsea.py b/gseapy/ssgsea.py new file mode 100644 index 0000000..168b57d --- /dev/null +++ b/gseapy/ssgsea.py @@ -0,0 +1,214 @@ +#! python +# -*- coding: utf-8 -*- +import os +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd + +from gseapy.base import GSEAbase +from gseapy.gse import CorrelType, ssgsea_rs +from gseapy.utils import mkdirs + + +class SingleSampleGSEA(GSEAbase): + """GSEA extension: single sample GSEA""" + + def __init__( + self, + data: Union[pd.DataFrame, pd.Series, str], + gene_sets: Union[List[str], str, Dict[str, str]], + outdir: Optional[str] = None, + sample_norm_method: Optional[str] = "rank", + correl_norm_type: Optional[str] = None, + min_size: int = 15, + max_size: int = 500, + permutation_num: Optional[int] = None, + weight: float = 0.25, + ascending: bool = False, + threads: int = 1, + figsize: Tuple[float, float] = (6.5, 6), + format: str = "pdf", + graph_num: int = 20, + no_plot: bool = True, + seed: int = 123, + verbose: bool = False, + **kwargs, + ): + super(SingleSampleGSEA, self).__init__( + outdir=outdir, + gene_sets=gene_sets, + module="ssgsea", + threads=threads, + verbose=verbose, + ) + self.data = data + self.sample_norm_method = sample_norm_method + self.correl_type = self.norm_correl(correl_norm_type) + self.weight = weight + self.min_size = min_size + self.max_size = max_size + self.permutation_num = int(permutation_num) if permutation_num else None + self.ascending = ascending + self.figsize = figsize + self.format = format + self.graph_num = int(graph_num) + self.seed = seed + self.ranking = None + self._noplot = no_plot + self.permutation_type = "gene_set" + + def corplot(self): + """NES Correlation plot + TODO + """ + + def setplot(self): + """ranked genes' location plot + TODO + """ + + def load_data(self) -> pd.DataFrame: + # load data + exprs = self.data + if isinstance(exprs, pd.DataFrame): + rank_metric = exprs.copy() + # handle dataframe with gene_name as index. + self._logger.debug("Input data is a DataFrame with gene names") + # handle index is not gene_names + if rank_metric.index.dtype != "O": + rank_metric.set_index(keys=rank_metric.columns[0], inplace=True) + if rank_metric.columns.dtype != "O": + rank_metric.columns = rank_metric.columns.astype(str) + + rank_metric = rank_metric.select_dtypes(include=[np.number]) + elif isinstance(exprs, pd.Series): + # change to DataFrame + self._logger.debug("Input data is a Series with gene names") + if exprs.name is None: + # rename col if name attr is none + exprs.name = "sample1" + elif exprs.name.dtype != "O": + exprs.name = exprs.name.astype(str) + rank_metric = exprs.to_frame() + elif os.path.isfile(exprs): + # GCT input format? + if exprs.endswith("gct"): + rank_metric = pd.read_csv( + exprs, skiprows=1, comment="#", index_col=0, sep="\t" + ) + else: + # just txt file like input + sep = "\t" + if exprs.endswith("csv"): + sep = "," + rank_metric = pd.read_csv(exprs, comment="#", index_col=0, sep=sep) + if rank_metric.shape[1] == 1: + # rnk file like input + rank_metric.columns = rank_metric.columns.astype(str) + # select numbers + rank_metric = rank_metric.select_dtypes(include=[np.number]) + else: + raise Exception("Error parsing gene ranking values!") + + if rank_metric.isnull().any().sum() > 0: + self._logger.warning("Input data contains NA, filled NA with 0") + rank_metric = rank_metric.fillna(0) + + if rank_metric.index.duplicated().sum() > 0: + self._logger.warning( + "Found duplicated gene names, values averaged by gene names!" + ) + rank_metric = rank_metric.groupby(level=0).mean() + return rank_metric + + def norm_samples(self, dat: pd.DataFrame) -> pd.DataFrame: + """normalization samples + see here: https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L237 + """ + if self.sample_norm_method is None: + self._logger.info("Use input as rank metric for ssGSEA") + return dat + + if self.sample_norm_method == "rank": + data = dat.rank(axis=0, method="average", na_option="bottom") + data = 10000 * data / data.shape[0] + elif self.sample_norm_method == "log_rank": + data = dat.rank(axis=0, method="average", na_option="bottom") + data = np.log(10000 * data / data.shape[0] + np.exp(1)) + elif self.sample_norm_method == "log": + dat[dat < 1] = 1 + data = np.log(dat + np.exp(1)) + elif self.sample_norm_method == "custom": + self._logger.info("Use custom rank metric for ssGSEA") + data = dat + else: + raise Exception("No supported method: %s" % self.sample_norm_method) + + return data + + def norm_correl(self, cortype): + """ + After norm_samples, input value will be further nomizalized before using them to calculate scores. + see source code: + https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L396 + """ + if cortype is None: + return CorrelType.Rank + + if cortype == "zscore": + return CorrelType.ZScore + elif cortype == "rank": + return CorrelType.Rank + elif cortype == "symrank": + return CorrelType.SymRank + else: + raise Exception("unsupported correl type input") + + def run(self): + """run entry""" + if self.permutation_num is None: + self.permutation_num = 0 + self._logger.info("Parsing data files for ssGSEA...........................") + # load data + data = self.load_data() + # normalized samples, and rank + normdat = self.norm_samples(data) + self.ranking = normdat + # filtering out gene sets and build gene sets dictionary + gmt = self.load_gmt(gene_list=normdat.index.values, gmt=self.gene_sets) + self.gmt = gmt + self._logger.info( + "%04d gene_sets used for further statistical testing....." % len(gmt) + ) + # start analysis + self._logger.info("Start to run ssGSEA...Might take a while................") + if self.permutation_num > 0: + # run permutation procedure and calculate pvals, fdrs + self._logger.warning( + "Run ssGSEA with permutation procedure, don't use the pval, fdr results for publication." + ) + self.runSamplesPermu(df=normdat, gmt=gmt) + + def runSamplesPermu( + self, df: pd.DataFrame, gmt: Optional[Dict[str, List[str]]] = None + ): + """Single Sample GSEA workflow with permutation procedure""" + + assert self.min_size <= self.max_size + if self._outdir: + mkdirs(self.outdir) + gsum = ssgsea_rs( + df.index.values.tolist(), + df.values.tolist(), + gmt, + self.weight, + self.min_size, + self.max_size, + self.permutation_num, # permutate just like runing prerank analysis + self.correl_type, + self._threads, + self.seed, + ) + self.to_df(gsum.summaries, gmt, df, gsum.indices) + return