From 6bca1f0b8a84f812cefa81a598a0cd30876b2a45 Mon Sep 17 00:00:00 2001
From: zqfang <fangzq@stanford.edu>
Date: Mon, 23 Oct 2023 16:38:27 -0700
Subject: [PATCH] refactor

---
 gseapy/__init__.py |  17 ++--
 gseapy/__main__.py |   2 +-
 gseapy/gsea.py     | 200 ------------------------------------------
 gseapy/ssgsea.py   | 214 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 224 insertions(+), 209 deletions(-)
 create mode 100644 gseapy/ssgsea.py

diff --git a/gseapy/__init__.py b/gseapy/__init__.py
index da7409d..d8b8df8 100644
--- a/gseapy/__init__.py
+++ b/gseapy/__init__.py
@@ -6,11 +6,12 @@
 from .__main__ import __version__
 from .biomart import Biomart
 from .enrichr import Enrichr
-from .gsea import GSEA, Prerank, Replot, SingleSampleGSEA
+from .gsea import GSEA, Prerank, Replot
 from .gsva import GSVA
 from .msigdb import Msigdb
 from .parser import get_library, get_library_name, read_gmt
 from .plot import barplot, dotplot, enrichment_map, gseaplot, gseaplot2, heatmap
+from .ssgsea import SingleSampleGSEA
 
 
 def gsea(
@@ -162,8 +163,8 @@ def ssgsea(
     data: Union[pd.Series, pd.DataFrame, str],
     gene_sets: Union[List[str], str, Dict[str, str]],
     outdir: Optional[str] = None,
-    sample_norm_method: str = "rank",
-    correl_norm_type: str = "rank",
+    sample_norm_method: Optional[str] = "rank",
+    correl_norm_type: Optional[str] = "rank",
     min_size: int = 15,
     max_size: int = 500,
     permutation_num: Optional[int] = None,
@@ -187,20 +188,20 @@ def ssgsea(
 
     :param outdir: Results output directory. If None, nothing will write to disk.
 
-    :param str sample_norm_method: Sample normalization method. Choose from {'rank', 'log', 'log_rank'}. Default: rank.
+    :param str sample_norm_method: Sample normalization method. Choose from {'rank', 'log', 'log_rank', None}. Default: rank.
                this argument will be used for ordering genes.
 
                1. 'rank': Rank your expression data, and transform by 10000*rank_dat/gene_numbers
                2. 'log' : Do not rank, but transform data by log(data + exp(1)), while data = data[data<1] =1.
                3. 'log_rank': Rank your expression data, and transform by log(10000*rank_dat/gene_numbers+ exp(1))
-               4. 'custom': Do nothing, and use your own rank value to calculate enrichment score.
+               4. None or 'custom': Do nothing, and use your own rank value to calculate enrichment score.
 
     see here: https://github.com/GSEA-MSigDB/ssGSEAProjection-gpmodule/blob/master/src/ssGSEAProjection.Library.R, line 86
 
-    :param str correl_norm_type: correlation normalization type. Choose from {'rank', 'symrank', 'zscore'}. Default: rank.
+    :param str correl_norm_type: correlation normalization type. Choose from {'rank', 'symrank', 'zscore', None}. Default: rank.
             After ordering genes by sample_norm_method, further data transformed could be applied to get enrichment score.
 
-            when weight==0, sample_norm_method and correl_norm_type do not matter;
+            when weight == 0, sample_norm_method and correl_norm_type do not matter;
             when weight > 0, the combination of sample_norm_method and correl_norm_type
             dictate how the gene expression values in input data are transformed
             to obtain the score -- use this setting with care (the transformations
@@ -209,7 +210,7 @@ def ssgsea(
             sample_norm_method will first transformed and rank original data. the data is named correl_vector for each sample.
             then correl_vector is transformed again by
 
-            1. correl_norm_type =='rank':  do nothing, genes are weighted by actual correl_vector.
+            1. correl_norm_type is None or 'rank' :  do nothing, genes are weighted by actual correl_vector.
             2. correl_norm_type =='symrank': symmetric ranking.
             3. correl_norm_type =='zscore':  standardizes the correl_vector before using them to calculate scores.
 
diff --git a/gseapy/__main__.py b/gseapy/__main__.py
index 33160c7..9a9d68f 100644
--- a/gseapy/__main__.py
+++ b/gseapy/__main__.py
@@ -85,7 +85,7 @@ def main():
         pre.run()
 
     elif subcommand == "ssgsea":
-        from .gsea import SingleSampleGSEA
+        from .ssgsea import SingleSampleGSEA
 
         ss = SingleSampleGSEA(
             data=args.data,
diff --git a/gseapy/gsea.py b/gseapy/gsea.py
index 7da5eba..bf0cf7c 100644
--- a/gseapy/gsea.py
+++ b/gseapy/gsea.py
@@ -13,13 +13,10 @@
 
 from gseapy.base import GSEAbase
 from gseapy.gse import (
-    CorrelType,
     Metric,
     gsea_rs,
     prerank2d_rs,
     prerank_rs,
-    ssgsea_rs,
-    gsva_rs,
 )
 from gseapy.parser import gsea_cls_parser
 from gseapy.plot import gseaplot
@@ -481,203 +478,6 @@ def run(self):
         return
 
 
-class SingleSampleGSEA(GSEAbase):
-    """GSEA extension: single sample GSEA"""
-
-    def __init__(
-        self,
-        data: Union[pd.DataFrame, pd.Series, str],
-        gene_sets: Union[List[str], str, Dict[str, str]],
-        outdir: Optional[str] = None,
-        sample_norm_method: str = "rank",
-        correl_norm_type: str = "zscore",
-        min_size: int = 15,
-        max_size: int = 500,
-        permutation_num: Optional[int] = None,
-        weight: float = 0.25,
-        ascending: bool = False,
-        threads: int = 1,
-        figsize: Tuple[float, float] = (6.5, 6),
-        format: str = "pdf",
-        graph_num: int = 20,
-        no_plot: bool = True,
-        seed: int = 123,
-        verbose: bool = False,
-        **kwargs,
-    ):
-        super(SingleSampleGSEA, self).__init__(
-            outdir=outdir,
-            gene_sets=gene_sets,
-            module="ssgsea",
-            threads=threads,
-            verbose=verbose,
-        )
-        self.data = data
-        self.sample_norm_method = sample_norm_method
-        self.correl_type = self.norm_correl(correl_norm_type)
-        self.weight = weight
-        self.min_size = min_size
-        self.max_size = max_size
-        self.permutation_num = int(permutation_num) if permutation_num else None
-        self.ascending = ascending
-        self.figsize = figsize
-        self.format = format
-        self.graph_num = int(graph_num)
-        self.seed = seed
-        self.ranking = None
-        self._noplot = no_plot
-        self.permutation_type = "gene_set"
-
-    def corplot(self):
-        """NES Correlation plot
-        TODO
-        """
-
-    def setplot(self):
-        """ranked genes' location plot
-        TODO
-        """
-
-    def load_data(self) -> pd.DataFrame:
-        # load data
-        exprs = self.data
-        if isinstance(exprs, pd.DataFrame):
-            rank_metric = exprs.copy()
-            # handle dataframe with gene_name as index.
-            self._logger.debug("Input data is a DataFrame with gene names")
-            # handle index is not gene_names
-            if rank_metric.index.dtype != "O":
-                rank_metric.set_index(keys=rank_metric.columns[0], inplace=True)
-            if rank_metric.columns.dtype != "O":
-                rank_metric.columns = rank_metric.columns.astype(str)
-
-            rank_metric = rank_metric.select_dtypes(include=[np.number])
-        elif isinstance(exprs, pd.Series):
-            # change to DataFrame
-            self._logger.debug("Input data is a Series with gene names")
-            if exprs.name is None:
-                # rename col if name attr is none
-                exprs.name = "sample1"
-            elif exprs.name.dtype != "O":
-                exprs.name = exprs.name.astype(str)
-            rank_metric = exprs.to_frame()
-        elif os.path.isfile(exprs):
-            # GCT input format?
-            if exprs.endswith("gct"):
-                rank_metric = pd.read_csv(
-                    exprs, skiprows=1, comment="#", index_col=0, sep="\t"
-                )
-            else:
-                # just txt file like input
-                sep = "\t"
-                if exprs.endswith("csv"):
-                    sep = ","
-                rank_metric = pd.read_csv(exprs, comment="#", index_col=0, sep=sep)
-                if rank_metric.shape[1] == 1:
-                    # rnk file like input
-                    rank_metric.columns = rank_metric.columns.astype(str)
-            # select numbers
-            rank_metric = rank_metric.select_dtypes(include=[np.number])
-        else:
-            raise Exception("Error parsing gene ranking values!")
-
-        if rank_metric.isnull().any().sum() > 0:
-            self._logger.warning("Input data contains NA, filled NA with 0")
-            rank_metric = rank_metric.fillna(0)
-
-        if rank_metric.index.duplicated().sum() > 0:
-            self._logger.warning(
-                "Found duplicated gene names, values averaged by gene names!"
-            )
-            rank_metric = rank_metric.groupby(level=0).mean()
-        return rank_metric
-
-    def norm_samples(self, dat: pd.DataFrame) -> pd.DataFrame:
-        """normalization samples
-        see here: https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L237
-        """
-
-        if self.sample_norm_method == "rank":
-            data = dat.rank(axis=0, method="average", na_option="bottom")
-            data = 10000 * data / data.shape[0]
-        elif self.sample_norm_method == "log_rank":
-            data = dat.rank(axis=0, method="average", na_option="bottom")
-            data = np.log(10000 * data / data.shape[0] + np.exp(1))
-        elif self.sample_norm_method == "log":
-            dat[dat < 1] = 1
-            data = np.log(dat + np.exp(1))
-        elif self.sample_norm_method == "custom":
-            self._logger.info("Use custom rank metric for ssGSEA")
-            data = dat
-        else:
-            raise Exception("No supported method: %s" % self.sample_norm_method)
-
-        return data
-
-    def norm_correl(self, cortype):
-        """
-        After norm_samples, input value will be further nomizalized before using them to calculate scores.
-        see source code:
-        https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L396
-        """
-        if cortype == "zscore":
-            return CorrelType.ZScore
-        elif cortype == "rank":
-            return CorrelType.Rank
-        elif cortype == "symrank":
-            return CorrelType.SymRank
-        else:
-            raise Exception("unsupported correl type input")
-
-    def run(self):
-        """run entry"""
-        if self.permutation_num is None:
-            self.permutation_num = 0
-        self._logger.info("Parsing data files for ssGSEA...........................")
-        # load data
-        data = self.load_data()
-        # normalized samples, and rank
-        normdat = self.norm_samples(data)
-        self.ranking = normdat
-        # filtering out gene sets and build gene sets dictionary
-        gmt = self.load_gmt(gene_list=normdat.index.values, gmt=self.gene_sets)
-        self.gmt = gmt
-        self._logger.info(
-            "%04d gene_sets used for further statistical testing....." % len(gmt)
-        )
-        # start analysis
-        self._logger.info("Start to run ssGSEA...Might take a while................")
-        if self.permutation_num > 0:
-            # run permutation procedure and calculate pvals, fdrs
-            self._logger.warning(
-                "Run ssGSEA with permutation procedure, don't use the pval, fdr results for publication."
-            )
-        self.runSamplesPermu(df=normdat, gmt=gmt)
-
-    def runSamplesPermu(
-        self, df: pd.DataFrame, gmt: Optional[Dict[str, List[str]]] = None
-    ):
-        """Single Sample GSEA workflow with permutation procedure"""
-
-        assert self.min_size <= self.max_size
-        if self._outdir:
-            mkdirs(self.outdir)
-        gsum = ssgsea_rs(
-            df.index.values.tolist(),
-            df.values.tolist(),
-            gmt,
-            self.weight,
-            self.min_size,
-            self.max_size,
-            self.permutation_num,  # permutate just like runing prerank analysis
-            self.correl_type,
-            self._threads,
-            self.seed,
-        )
-        self.to_df(gsum.summaries, gmt, df, gsum.indices)
-        return
-
-
 class Replot(GSEAbase):
     """To reproduce GSEA desktop output results."""
 
diff --git a/gseapy/ssgsea.py b/gseapy/ssgsea.py
new file mode 100644
index 0000000..168b57d
--- /dev/null
+++ b/gseapy/ssgsea.py
@@ -0,0 +1,214 @@
+#! python
+# -*- coding: utf-8 -*-
+import os
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+
+from gseapy.base import GSEAbase
+from gseapy.gse import CorrelType, ssgsea_rs
+from gseapy.utils import mkdirs
+
+
+class SingleSampleGSEA(GSEAbase):
+    """GSEA extension: single sample GSEA"""
+
+    def __init__(
+        self,
+        data: Union[pd.DataFrame, pd.Series, str],
+        gene_sets: Union[List[str], str, Dict[str, str]],
+        outdir: Optional[str] = None,
+        sample_norm_method: Optional[str] = "rank",
+        correl_norm_type: Optional[str] = None,
+        min_size: int = 15,
+        max_size: int = 500,
+        permutation_num: Optional[int] = None,
+        weight: float = 0.25,
+        ascending: bool = False,
+        threads: int = 1,
+        figsize: Tuple[float, float] = (6.5, 6),
+        format: str = "pdf",
+        graph_num: int = 20,
+        no_plot: bool = True,
+        seed: int = 123,
+        verbose: bool = False,
+        **kwargs,
+    ):
+        super(SingleSampleGSEA, self).__init__(
+            outdir=outdir,
+            gene_sets=gene_sets,
+            module="ssgsea",
+            threads=threads,
+            verbose=verbose,
+        )
+        self.data = data
+        self.sample_norm_method = sample_norm_method
+        self.correl_type = self.norm_correl(correl_norm_type)
+        self.weight = weight
+        self.min_size = min_size
+        self.max_size = max_size
+        self.permutation_num = int(permutation_num) if permutation_num else None
+        self.ascending = ascending
+        self.figsize = figsize
+        self.format = format
+        self.graph_num = int(graph_num)
+        self.seed = seed
+        self.ranking = None
+        self._noplot = no_plot
+        self.permutation_type = "gene_set"
+
+    def corplot(self):
+        """NES Correlation plot
+        TODO
+        """
+
+    def setplot(self):
+        """ranked genes' location plot
+        TODO
+        """
+
+    def load_data(self) -> pd.DataFrame:
+        # load data
+        exprs = self.data
+        if isinstance(exprs, pd.DataFrame):
+            rank_metric = exprs.copy()
+            # handle dataframe with gene_name as index.
+            self._logger.debug("Input data is a DataFrame with gene names")
+            # handle index is not gene_names
+            if rank_metric.index.dtype != "O":
+                rank_metric.set_index(keys=rank_metric.columns[0], inplace=True)
+            if rank_metric.columns.dtype != "O":
+                rank_metric.columns = rank_metric.columns.astype(str)
+
+            rank_metric = rank_metric.select_dtypes(include=[np.number])
+        elif isinstance(exprs, pd.Series):
+            # change to DataFrame
+            self._logger.debug("Input data is a Series with gene names")
+            if exprs.name is None:
+                # rename col if name attr is none
+                exprs.name = "sample1"
+            elif exprs.name.dtype != "O":
+                exprs.name = exprs.name.astype(str)
+            rank_metric = exprs.to_frame()
+        elif os.path.isfile(exprs):
+            # GCT input format?
+            if exprs.endswith("gct"):
+                rank_metric = pd.read_csv(
+                    exprs, skiprows=1, comment="#", index_col=0, sep="\t"
+                )
+            else:
+                # just txt file like input
+                sep = "\t"
+                if exprs.endswith("csv"):
+                    sep = ","
+                rank_metric = pd.read_csv(exprs, comment="#", index_col=0, sep=sep)
+                if rank_metric.shape[1] == 1:
+                    # rnk file like input
+                    rank_metric.columns = rank_metric.columns.astype(str)
+            # select numbers
+            rank_metric = rank_metric.select_dtypes(include=[np.number])
+        else:
+            raise Exception("Error parsing gene ranking values!")
+
+        if rank_metric.isnull().any().sum() > 0:
+            self._logger.warning("Input data contains NA, filled NA with 0")
+            rank_metric = rank_metric.fillna(0)
+
+        if rank_metric.index.duplicated().sum() > 0:
+            self._logger.warning(
+                "Found duplicated gene names, values averaged by gene names!"
+            )
+            rank_metric = rank_metric.groupby(level=0).mean()
+        return rank_metric
+
+    def norm_samples(self, dat: pd.DataFrame) -> pd.DataFrame:
+        """normalization samples
+        see here: https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L237
+        """
+        if self.sample_norm_method is None:
+            self._logger.info("Use input as rank metric for ssGSEA")
+            return dat
+
+        if self.sample_norm_method == "rank":
+            data = dat.rank(axis=0, method="average", na_option="bottom")
+            data = 10000 * data / data.shape[0]
+        elif self.sample_norm_method == "log_rank":
+            data = dat.rank(axis=0, method="average", na_option="bottom")
+            data = np.log(10000 * data / data.shape[0] + np.exp(1))
+        elif self.sample_norm_method == "log":
+            dat[dat < 1] = 1
+            data = np.log(dat + np.exp(1))
+        elif self.sample_norm_method == "custom":
+            self._logger.info("Use custom rank metric for ssGSEA")
+            data = dat
+        else:
+            raise Exception("No supported method: %s" % self.sample_norm_method)
+
+        return data
+
+    def norm_correl(self, cortype):
+        """
+        After norm_samples, input value will be further nomizalized before using them to calculate scores.
+        see source code:
+        https://github.com/broadinstitute/ssGSEA2.0/blob/f682082f62ae34185421545f25041bae2c78c89b/src/ssGSEA2.0.R#L396
+        """
+        if cortype is None:
+            return CorrelType.Rank
+
+        if cortype == "zscore":
+            return CorrelType.ZScore
+        elif cortype == "rank":
+            return CorrelType.Rank
+        elif cortype == "symrank":
+            return CorrelType.SymRank
+        else:
+            raise Exception("unsupported correl type input")
+
+    def run(self):
+        """run entry"""
+        if self.permutation_num is None:
+            self.permutation_num = 0
+        self._logger.info("Parsing data files for ssGSEA...........................")
+        # load data
+        data = self.load_data()
+        # normalized samples, and rank
+        normdat = self.norm_samples(data)
+        self.ranking = normdat
+        # filtering out gene sets and build gene sets dictionary
+        gmt = self.load_gmt(gene_list=normdat.index.values, gmt=self.gene_sets)
+        self.gmt = gmt
+        self._logger.info(
+            "%04d gene_sets used for further statistical testing....." % len(gmt)
+        )
+        # start analysis
+        self._logger.info("Start to run ssGSEA...Might take a while................")
+        if self.permutation_num > 0:
+            # run permutation procedure and calculate pvals, fdrs
+            self._logger.warning(
+                "Run ssGSEA with permutation procedure, don't use the pval, fdr results for publication."
+            )
+        self.runSamplesPermu(df=normdat, gmt=gmt)
+
+    def runSamplesPermu(
+        self, df: pd.DataFrame, gmt: Optional[Dict[str, List[str]]] = None
+    ):
+        """Single Sample GSEA workflow with permutation procedure"""
+
+        assert self.min_size <= self.max_size
+        if self._outdir:
+            mkdirs(self.outdir)
+        gsum = ssgsea_rs(
+            df.index.values.tolist(),
+            df.values.tolist(),
+            gmt,
+            self.weight,
+            self.min_size,
+            self.max_size,
+            self.permutation_num,  # permutate just like runing prerank analysis
+            self.correl_type,
+            self._threads,
+            self.seed,
+        )
+        self.to_df(gsum.summaries, gmt, df, gsum.indices)
+        return