From ed2516c606f8617ef387a805b33951b9077b0cfc Mon Sep 17 00:00:00 2001
From: zqfang <fangzq@stanford.edu>
Date: Tue, 21 Nov 2023 12:43:07 -0800
Subject: [PATCH] expr data parsing, refactor

---
 gseapy/base.py   | 152 ++++++++++++++++++++----------------
 gseapy/gsea.py   | 196 +++++++++++++++++++++++++++--------------------
 gseapy/gsva.py   |  55 +------------
 gseapy/ssgsea.py |  53 +------------
 4 files changed, 201 insertions(+), 255 deletions(-)

diff --git a/gseapy/base.py b/gseapy/base.py
index 2227acf..9e749f4 100644
--- a/gseapy/base.py
+++ b/gseapy/base.py
@@ -139,84 +139,102 @@ def _set_cores(self):
         # have to be int if user input is float
         self._threads = int(cores)
 
-    def _load_ranking(self, rnk: Union[pd.DataFrame, pd.Series, str]) -> pd.Series:
-        """Parse ranking file. This file contains ranking correlation vector( or expression values)
-        and gene names or ids.
+    def _read_file(self, path: str) -> pd.DataFrame:
+        """
+        read file, and return dataframe (first column are gene IDs)
+        """
+        # just txt file like input
+        header, sep = "infer", "\t"
+        # GCT input format?
+        if path.endswith(".gct"):
+            rank_metric = pd.read_csv(
+                path, skiprows=1, comment="#", index_col=0, sep=sep
+            )
+        else:
+            if path.endswith(".csv"):
+                sep = ","
+            if path.endswith(".rnk"):
+                header = None
+            rank_metric = pd.read_csv(
+                path, comment="#", index_col=0, sep=sep, header=header
+            )
+            if rank_metric.shape[1] == 1:
+                # rnk file like input
+                rank_metric.columns = rank_metric.columns.astype(str)
 
-         :param rnk: the .rnk file of GSEA input or a Pandas DataFrame, Series instance.
-         :return: a Pandas Series with gene name indexed rankings
+        return rank_metric.select_dtypes(include=[np.number]).reset_index()
 
+    def _load_data(self, exprs: Union[str, pd.Series, pd.DataFrame]) -> pd.DataFrame:
+        """
+        helper function to read data
         """
         # load data
-        if isinstance(rnk, pd.DataFrame):
-            rank_metric = rnk.copy()
+        if isinstance(exprs, pd.DataFrame):
+            rank_metric = exprs.copy()
             # handle dataframe with gene_name as index.
-            if rnk.shape[1] == 1:
-                rank_metric = rnk.reset_index()
-        elif isinstance(rnk, pd.Series):
-            rank_metric = rnk.reset_index()
-        elif os.path.isfile(rnk):
-            sep = "," if rnk.endswith(".csv") else "\t"
-            rank_metric = pd.read_table(rnk, header=None, sep=sep)
+            self._logger.debug("Input data is a DataFrame with gene names")
+            # handle index is already gene_names
+            if rank_metric.index.dtype == "O":
+                rank_metric = rank_metric.reset_index()
+                # rank_metric.set_index(keys=rank_metric.columns[0], inplace=True)
+            # if rank_metric.columns.dtype != "O":
+            rank_metric.columns = rank_metric.columns.astype(str)
+
+        elif isinstance(exprs, pd.Series):
+            # change to DataFrame
+            self._logger.debug("Input data is a Series with gene names")
+            if exprs.name is None:
+                # rename col if name attr is none
+                exprs.name = "sample1"
+            elif exprs.name.dtype != "O":
+                exprs.name = exprs.name.astype(str)
+            rank_metric = exprs.reset_index()
+        elif os.path.isfile(exprs):
+            rank_metric = self._read_file(exprs)
+
         else:
-            raise Exception("Error parsing gene ranking values!")
-
-        if rank_metric.select_dtypes(np.number).shape[1] > 1:
-            return rank_metric
-        # sort ranking values from high to low
-        rnk_cols = rank_metric.columns
-        rank_metric.sort_values(by=rnk_cols[1], ascending=self.ascending, inplace=True)
-        # drop na values
-        if rank_metric.isnull().any(axis=1).sum() > 0:
-            self._logger.warning(
-                "Input gene rankings contains NA values(gene name and ranking value), drop them all!"
-            )
-            # print out NAs
-            NAs = rank_metric[rank_metric.isnull().any(axis=1)]
-            self._logger.debug("NAs list:\n" + NAs.to_string())
-            rank_metric.dropna(how="any", inplace=True)
-        # drop duplicate IDs, keep the first
-        if rank_metric.duplicated(subset=rnk_cols[0]).sum() > 0:
-            self._logger.info("Input gene rankings contains duplicated IDs")
-            mask = rank_metric.duplicated(subset=rnk_cols[0]).duplicated(keep=False)
-            dups = (
-                rank_metric[mask]
-                .groupby(rnk_cols[0])
-                .cumcount()
-                .map(lambda c: "_" + str(c) if c else "")
-            )
-            rank_metric.loc[mask, rnk_cols[0]] = (
-                rank_metric.loc[mask, rnk_cols[0]] + dups
-            )
-            # dups = rank_metric[rank_metric.duplicated(subset=rnk_cols[0])]
-            # self._logger.debug("Dups list:\n" + dups.to_string())
-            # rank_metric.drop_duplicates(
-            #     subset=rank_metric.columns[0], inplace=True, keep="first"
-            # )
-        # reset ranking index, because you have sort values and drop duplicates.
-        rank_metric.reset_index(drop=True, inplace=True)
-        rank_metric.columns = ["gene_name", "prerank"]
-        rankser = rank_metric.set_index("gene_name", drop=True).squeeze()
+            raise Exception("Error parsing expression values!")
+        # select numbers
+        # rank_metric = rank_metric.select_dtypes(include=[np.number])
+        return rank_metric
 
+    def _check_data(self, exprs: pd.DataFrame) -> pd.DataFrame:
+        """
+        check NAs, duplicates.
+        exprs: dataframe, the frist column must be gene identifiers
+        """
+        ## if gene names contain NA, drop them
+        if exprs.iloc[:, 0].isnull().any():
+            exprs.dropna(subset=[exprs.columns[0]])
+        ## then fill na for numeric columns
+        if exprs.isnull().any().sum() > 0:
+            self._logger.warning("Input data contains NA, filled NA with 0")
+            exprs.dropna(how="all", inplace=True)  # drop rows with all NAs
+            exprs = exprs.fillna(0)
+        ## check duplicated IDs
+        # gene_id = exprs.columns[0]
+        # if exprs.duplicated(subset=gene_id).sum() > 0:
+        #     self._logger.info("Found duplicated gene names, make unique")
+        #     mask = exprs.duplicated(subset=gene_id, keep=False) #
+        #     dups = exprs.loc[mask, gene_id].groupby(gene_id).cumcount().map(lambda c: "_" + str(c) if c else "")
+        #     exprs.loc[mask, gene_id] = exprs.loc[mask, gene_id] + dups
         # check whether contains infinity values
-        if np.isinf(rankser).values.sum() > 0:
-            self._logger.warning("Input gene rankings contains inf values!")
-            rankser.replace(-np.inf, method="ffill", inplace=True)
-            rankser.replace(np.inf, method="bfill", inplace=True)
 
-        # check duplicate values and warning
-        dups = rankser.duplicated().sum()
-        if dups > 0:
-            msg = (
-                "Duplicated values found in preranked stats: {:.2%} of genes\n".format(
-                    dups / rankser.size
-                )
+        # set gene name as index
+        exprs.set_index(keys=exprs.columns[0], inplace=True)
+        # select numberic columns
+        df = exprs.select_dtypes(include=[np.number])
+        # microarray data may contained multiple probs of same gene, average them
+        if exprs.index.duplicated().sum() > 0:
+            self._logger.warning(
+                "Found duplicated gene names, values averaged by gene names!"
             )
-            msg += "The order of those genes will be arbitrary, which may produce unexpected results."
-            self._logger.warning(msg)
+            df = df.groupby(level=0).mean()
 
-        # return series
-        return rankser
+        if np.isinf(df).values.sum() > 0:
+            self._logger.warning("Input gene rankings contains inf values!")
+            df = df.apply()
+        return df
 
     def load_gmt_only(
         self, gmt: Union[List[str], str, Dict[str, str]]
diff --git a/gseapy/gsea.py b/gseapy/gsea.py
index 5f3bbff..e623a4f 100644
--- a/gseapy/gsea.py
+++ b/gseapy/gsea.py
@@ -68,52 +68,13 @@ def __init__(
         # phenotype labels parsing
         self.load_classes(classes)
 
-    def load_data(
-        self, groups: Union[List[str], Dict[str, Any]]
-    ) -> Tuple[pd.DataFrame, Dict]:
+    def load_data(self) -> Tuple[pd.DataFrame, Dict]:
         """pre-processed the data frame.new filtering methods will be implement here."""
-        # read data in
-        if isinstance(self.data, pd.DataFrame):
-            exprs = self.data.copy()
-            # handle index is gene_names
-            if exprs.index.dtype == "O":
-                exprs = exprs.reset_index()
-        elif os.path.isfile(self.data):
-            # GCT input format?
-            if self.data.endswith("gct"):
-                exprs = pd.read_csv(self.data, skiprows=2, sep="\t")
-            else:
-                sep = "\t"
-                if self.data.endswith("csv"):
-                    sep = ","
-                exprs = pd.read_csv(self.data, comment="#", sep=sep)
-        else:
-            raise Exception("Error parsing gene expression DataFrame!")
-
+        exprs = self._load_data(self.data)
         exprs = self._check_data(exprs)
         exprs, cls_dict = self._filter_data(exprs)
         return exprs, cls_dict
 
-    def _check_data(self, exprs: pd.DataFrame) -> pd.DataFrame:
-        """
-        check NAs, duplicates.
-        """
-        if exprs.isnull().any().sum() > 0:
-            self._logger.warning("Input data contains NA, filled NA with 0")
-            exprs.dropna(how="all", inplace=True)  # drop rows with all NAs
-            exprs = exprs.fillna(0)
-        # set gene name as index
-        exprs.set_index(keys=exprs.columns[0], inplace=True)
-        # select numberic columns
-        df = exprs.select_dtypes(include=[np.number])
-
-        if exprs.index.duplicated().sum() > 0:
-            self._logger.warning(
-                "Found duplicated gene names, values averaged by gene names!"
-            )
-            df = df.groupby(level=0).mean()
-        return df
-
     def _map_classes(self, sample_names: List[str]) -> Dict[str, Any]:
         """
         update
@@ -283,7 +244,7 @@ def run(self):
         # Start Analysis
         self._logger.info("Parsing data files for GSEA.............................")
         # select correct expression genes and values.
-        dat, cls_dict = self.load_data(self.groups)
+        dat, cls_dict = self.load_data()
         # data frame must have length > 1
         assert len(dat) > 1
         # filtering out gene sets and build gene sets dictionary
@@ -400,47 +361,111 @@ def __init__(
         self._noplot = no_plot
         self.permutation_type = "gene_set"
 
-    def load_ranking(self):
-        ranking = self._load_ranking(self.rnk)  # index is gene_names
-        if isinstance(ranking, pd.DataFrame):
-            # handle index is not gene_names
-            if ranking.index.dtype != "O":
-                ranking.set_index(keys=ranking.columns[0], inplace=True)
-            # handle columns names are intergers
-            if ranking.columns.dtype != "O":
-                ranking.columns = ranking.columns.astype(str)
-            # drop na values
-            ranking = ranking.loc[ranking.index.dropna()]
-            if ranking.isnull().any().sum() > 0:
-                self._logger.warning("Input rankings contains NA values!")
-                # fill na
-                ranking.dropna(how="all", inplace=True)
-                ranking.fillna(0, inplace=True)
-            # drop duplicate IDs, keep the first
-            if ranking.index.duplicated().sum() > 0:
-                self._logger.warning("Duplicated ID detected!")
-                ranking = ranking.loc[~ranking.index.duplicated(keep="first"), :]
-
-            # check whether contains infinity values
-            if np.isinf(ranking).values.sum() > 0:
-                self._logger.warning("Input gene rankings contains inf values!")
-                col_min_max = {
-                    np.inf: ranking[np.isfinite(ranking)].max(),  # column-wise max
-                    -np.inf: ranking[np.isfinite(ranking)].min(),
-                }  # column-wise min
-                ranking = ranking.replace({col: col_min_max for col in ranking.columns})
-
-            # check ties in prerank stats
-            dups = ranking.apply(lambda df: df.duplicated().sum() / df.size)
-            if (dups > 0).sum() > 0:
-                msg = (
-                    "Duplicated values found in preranked stats:\nsample\tratio\n%s\n"
-                    % (dups.to_string(float_format="{:,.2%}".format))
+    def make_unique(self, rank_metric: pd.DataFrame, col_idx: int) -> pd.DataFrame:
+        """
+        make gene id column unique
+        """
+        id_col = rank_metric.columns[col_idx]
+        if rank_metric.duplicated(subset=id_col).sum() > 0:
+            self._logger.info("Input gene rankings contains duplicated IDs")
+            mask = rank_metric.duplicated(subset=id_col, keep=False)
+            dups = (
+                rank_metric.loc[mask, id_col]
+                .groupby(id_col)
+                .cumcount()
+                .map(lambda c: "_" + str(c) if c else "")
+            )
+            rank_metric.loc[mask, id_col] = rank_metric.loc[mask, id_col] + dups
+        return rank_metric
+
+    def _load_ranking(self, rank_metric: pd.DataFrame) -> pd.Series:
+        """Parse ranking
+        rank_metric: two column dataframe. first column is gene ids
+
+        """
+        # load data
+        # sort ranking values from high to low
+        rnk_cols = rank_metric.columns
+        # if not ranking.is_monotonic_decreasing:
+        #     ranking = ranking.sort_values(ascending=self.ascending)
+        rank_metric.sort_values(by=rnk_cols[1], ascending=self.ascending, inplace=True)
+        # drop na values
+        if rank_metric.isnull().any(axis=1).sum() > 0:
+            self._logger.warning(
+                "Input gene rankings contains NA values(gene name and ranking value), drop them all!"
+            )
+            # print out NAs
+            NAs = rank_metric[rank_metric.isnull().any(axis=1)]
+            self._logger.debug("NAs list:\n" + NAs.to_string())
+            rank_metric.dropna(how="any", inplace=True)
+        # rename duplicate id, make them unique
+        rank_metric = self.make_unique(rank_metric, col_idx=0)
+        # reset ranking index, because you have sort values and drop duplicates.
+        rank_metric.reset_index(drop=True, inplace=True)
+        rank_metric.columns = ["gene_name", "prerank"]
+        rankser = rank_metric.set_index("gene_name", drop=True).squeeze()
+
+        # check whether contains infinity values
+        if np.isinf(rankser).values.sum() > 0:
+            self._logger.warning("Input gene rankings contains inf values!")
+            rankser.replace(-np.inf, method="ffill", inplace=True)
+            rankser.replace(np.inf, method="bfill", inplace=True)
+
+        # check duplicate values and warning
+        dups = rankser.duplicated().sum()
+        if dups > 0:
+            msg = (
+                "Duplicated values found in preranked stats: {:.2%} of genes\n".format(
+                    dups / rankser.size
                 )
-                msg += "The order of those genes will be arbitrary, which may produce unexpected results."
-                self._logger.warning(msg)
+            )
+            msg += "The order of those genes will be arbitrary, which may produce unexpected results."
+            self._logger.warning(msg)
+
+        # return series
+        return rankser
+
+    def load_ranking(self):
+        """
+        parse rnk input
+        """
+        rank_metric = self._load_data(self.rnk)  # gene id is the first column
+        if rank_metric.select_dtypes(np.number).shape[1] == 1:
+            # return series
+            return self._load_ranking(rank_metric)
+        ## In case the input type multi-column ranking dataframe
+        # drop na gene id values
+        rank_metric = rank_metric.dropna(subset=rank_metric.columns[0])
+        # make unique
+        rank_metric = self.make_unique(rank_metric, col_idx=0)
+        # set index
+        rank_metric.set_index(keys=rank_metric.columns[0], inplace=True)
+        if rank_metric.isnull().any().sum() > 0:
+            self._logger.warning("Input rankings contains NA values!")
+            # fill na
+            rank_metric.dropna(how="all", inplace=True)
+            rank_metric.fillna(0, inplace=True)
+
+        # check whether contains infinity values
+        if np.isinf(rank_metric).values.sum() > 0:
+            self._logger.warning("Input gene rankings contains inf values!")
+            col_min_max = {
+                np.inf: rank_metric[np.isfinite(rank_metric)].max(),  # column-wise max
+                -np.inf: rank_metric[np.isfinite(rank_metric)].min(),  # column-wise min
+            }
+            rank_metric = rank_metric.replace(
+                {col: col_min_max for col in rank_metric.columns}
+            )
+        # check ties in prerank stats
+        dups = rank_metric.apply(lambda df: df.duplicated().sum() / df.size)
+        if (dups > 0).sum() > 0:
+            msg = "Duplicated values found in preranked stats:\nsample\tratio\n%s\n" % (
+                dups.to_string(float_format="{:,.2%}".format)
+            )
+            msg += "The order of those genes will be arbitrary, which may produce unexpected results."
+            self._logger.warning(msg)
 
-        return ranking
+        return rank_metric
 
     # @profile
     def run(self):
@@ -576,9 +601,10 @@ def run(self):
         # obtain gene sets
         gene_set_dict = self.parse_gmt(gmt=gene_set_path)
         # obtain rank_metrics
-        rank_metric = self._load_ranking(rank_path)
-        correl_vector = rank_metric.values
-        gene_list = rank_metric.index.values
+        rank_metric = self._load_data(rank_path)
+        # rank_metric = rank_metric.set_index(rank_metric.columns[0])
+        correl_vector = rank_metric.iloc[:, 1].values
+        gene_list = rank_metric.iloc[:, 0].values
         # extract each enriment term in the results.edb files and plot.
 
         database = self.gsea_edb_parser(results_path)
@@ -600,7 +626,7 @@ def run(self):
                 self.outdir, term, self.module, self.format
             )
             gseaplot(
-                rank_metric=rank_metric,
+                rank_metric=correl_vector,
                 term=enrich_term,
                 hits=hit_ind,
                 nes=nes,
diff --git a/gseapy/gsva.py b/gseapy/gsva.py
index 42c4a5a..bc0e396 100644
--- a/gseapy/gsva.py
+++ b/gseapy/gsva.py
@@ -67,58 +67,9 @@ def __init__(
 
     def load_data(self) -> pd.DataFrame:
         # load data
-        exprs = self.data
-        if isinstance(exprs, pd.DataFrame):
-            rank_metric = exprs.copy()
-            # handle dataframe with gene_name as index.
-            self._logger.debug("Input data is a DataFrame with gene names")
-            # handle index is not gene_names
-            if rank_metric.index.dtype != "O":
-                rank_metric.set_index(keys=rank_metric.columns[0], inplace=True)
-            if rank_metric.columns.dtype != "O":
-                rank_metric.columns = rank_metric.columns.astype(str)
-
-            rank_metric = rank_metric.select_dtypes(include=[np.number])
-        elif isinstance(exprs, pd.Series):
-            # change to DataFrame
-            self._logger.debug("Input data is a Series with gene names")
-            if exprs.name is None:
-                # rename col if name attr is none
-                exprs.name = "sample1"
-            elif exprs.name.dtype != "O":
-                exprs.name = exprs.name.astype(str)
-            rank_metric = exprs.to_frame()
-        elif os.path.isfile(exprs):
-            # GCT input format?
-            if exprs.endswith("gct"):
-                rank_metric = pd.read_csv(
-                    exprs, skiprows=1, comment="#", index_col=0, sep="\t"
-                )
-            else:
-                sep = "\t"
-                if exprs.endswith("csv"):
-                    sep = ","
-                # just txt file like input
-                rank_metric = pd.read_csv(exprs, comment="#", index_col=0, sep=sep)
-                if rank_metric.shape[1] == 1:
-                    # rnk file like input
-                    rank_metric.columns = rank_metric.columns.astype(str)
-            # select numbers
-            rank_metric = rank_metric.select_dtypes(include=[np.number])
-        else:
-            raise Exception("Error parsing gene ranking values!")
-
-        if rank_metric.isnull().any().sum() > 0:
-            self._logger.warning("Input data contains NA, filled NA with 0")
-            rank_metric = rank_metric.fillna(0)
-
-        if rank_metric.index.duplicated().sum() > 0:
-            self._logger.warning(
-                "Found duplicated gene names, values averaged by gene names!"
-            )
-            rank_metric = rank_metric.groupby(level=0).mean()
-
-        return rank_metric
+        data = self._load_data(self.data)
+        data = self._check_data(data)
+        return data
 
     def run(self):
         """run entry"""
diff --git a/gseapy/ssgsea.py b/gseapy/ssgsea.py
index 168b57d..25b8716 100644
--- a/gseapy/ssgsea.py
+++ b/gseapy/ssgsea.py
@@ -70,57 +70,8 @@ def setplot(self):
 
     def load_data(self) -> pd.DataFrame:
         # load data
-        exprs = self.data
-        if isinstance(exprs, pd.DataFrame):
-            rank_metric = exprs.copy()
-            # handle dataframe with gene_name as index.
-            self._logger.debug("Input data is a DataFrame with gene names")
-            # handle index is not gene_names
-            if rank_metric.index.dtype != "O":
-                rank_metric.set_index(keys=rank_metric.columns[0], inplace=True)
-            if rank_metric.columns.dtype != "O":
-                rank_metric.columns = rank_metric.columns.astype(str)
-
-            rank_metric = rank_metric.select_dtypes(include=[np.number])
-        elif isinstance(exprs, pd.Series):
-            # change to DataFrame
-            self._logger.debug("Input data is a Series with gene names")
-            if exprs.name is None:
-                # rename col if name attr is none
-                exprs.name = "sample1"
-            elif exprs.name.dtype != "O":
-                exprs.name = exprs.name.astype(str)
-            rank_metric = exprs.to_frame()
-        elif os.path.isfile(exprs):
-            # GCT input format?
-            if exprs.endswith("gct"):
-                rank_metric = pd.read_csv(
-                    exprs, skiprows=1, comment="#", index_col=0, sep="\t"
-                )
-            else:
-                # just txt file like input
-                sep = "\t"
-                if exprs.endswith("csv"):
-                    sep = ","
-                rank_metric = pd.read_csv(exprs, comment="#", index_col=0, sep=sep)
-                if rank_metric.shape[1] == 1:
-                    # rnk file like input
-                    rank_metric.columns = rank_metric.columns.astype(str)
-            # select numbers
-            rank_metric = rank_metric.select_dtypes(include=[np.number])
-        else:
-            raise Exception("Error parsing gene ranking values!")
-
-        if rank_metric.isnull().any().sum() > 0:
-            self._logger.warning("Input data contains NA, filled NA with 0")
-            rank_metric = rank_metric.fillna(0)
-
-        if rank_metric.index.duplicated().sum() > 0:
-            self._logger.warning(
-                "Found duplicated gene names, values averaged by gene names!"
-            )
-            rank_metric = rank_metric.groupby(level=0).mean()
-        return rank_metric
+        exprs = self._load_data(self.data)
+        return self._check_data(exprs)
 
     def norm_samples(self, dat: pd.DataFrame) -> pd.DataFrame:
         """normalization samples