From 1a29be909841d2b0c5e9cfe2141ee224adb0ec13 Mon Sep 17 00:00:00 2001 From: zqfang Date: Sat, 21 Oct 2023 21:59:39 -0700 Subject: [PATCH] minor --- gseapy/base.py | 32 +++++++++++++++++++------------- gseapy/gsea.py | 26 ++++++++++++-------------- gseapy/gsva.py | 23 +++++++++-------------- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/gseapy/base.py b/gseapy/base.py index 6e97587..ce2e38d 100644 --- a/gseapy/base.py +++ b/gseapy/base.py @@ -164,9 +164,8 @@ def _load_ranking(self, rnk: Union[pd.DataFrame, pd.Series, str]) -> pd.Series: if rank_metric.select_dtypes(np.number).shape[1] > 1: return rank_metric # sort ranking values from high to low - rank_metric.sort_values( - by=rank_metric.columns[1], ascending=self.ascending, inplace=True - ) + rnk_cols = rank_metric.columns + rank_metric.sort_values(by=rnk_cols[1], ascending=self.ascending, inplace=True) # drop na values if rank_metric.isnull().any(axis=1).sum() > 0: self._logger.warning( @@ -177,16 +176,23 @@ def _load_ranking(self, rnk: Union[pd.DataFrame, pd.Series, str]) -> pd.Series: self._logger.debug("NAs list:\n" + NAs.to_string()) rank_metric.dropna(how="any", inplace=True) # drop duplicate IDs, keep the first - if rank_metric.duplicated(subset=rank_metric.columns[0]).sum() > 0: - self._logger.warning( - "Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!" + if rank_metric.duplicated(subset=rnk_cols[0]).sum() > 0: + self._logger.info("Input gene rankings contains duplicated IDs") + mask = rank_metric.duplicated(subset=rnk_cols[0]).duplicated(keep=False) + dups = ( + rank_metric[mask] + .groupby(rnk_cols[0]) + .cumcount() + .map(lambda c: "_" + str(c) if c else "") ) - # print out duplicated IDs. - dups = rank_metric[rank_metric.duplicated(subset=rank_metric.columns[0])] - self._logger.debug("Dups list:\n" + dups.to_string()) - rank_metric.drop_duplicates( - subset=rank_metric.columns[0], inplace=True, keep="first" + rank_metric.loc[mask, rnk_cols[0]] = ( + rank_metric.loc[mask, rnk_cols[0]] + dups ) + # dups = rank_metric[rank_metric.duplicated(subset=rnk_cols[0])] + # self._logger.debug("Dups list:\n" + dups.to_string()) + # rank_metric.drop_duplicates( + # subset=rank_metric.columns[0], inplace=True, keep="first" + # ) # reset ranking index, because you have sort values and drop duplicates. rank_metric.reset_index(drop=True, inplace=True) rank_metric.columns = ["gene_name", "prerank"] @@ -609,7 +615,7 @@ def to_df( res_df["NES"].abs().sort_values(ascending=False).index ).reset_index(drop=True) res_df.drop(dc, axis=1, inplace=True) - + if self._outdir is not None: out = os.path.join( self.outdir, @@ -751,7 +757,7 @@ def plot( ofname: savefig """ # if hasattr(self, "results"): - if self.module == "ssgsea": + if self.module in ["ssgsea", "gsva"]: raise NotImplementedError("not for ssgsea") keys = list(self._results.keys()) if len(keys) > 1: diff --git a/gseapy/gsea.py b/gseapy/gsea.py index d9a723b..7da5eba 100644 --- a/gseapy/gsea.py +++ b/gseapy/gsea.py @@ -97,13 +97,6 @@ def load_data(self, cls_vec: List[str]) -> Tuple[pd.DataFrame, Dict]: else: raise Exception("Error parsing gene expression DataFrame!") - # drop duplicated gene names - if exprs.iloc[:, 0].duplicated().sum() > 0: - self._logger.warning( - "Dropping duplicated gene names, only keep the first values" - ) - # drop duplicate gene_names. - exprs.drop_duplicates(subset=exprs.columns[0], inplace=True) if exprs.isnull().any().sum() > 0: self._logger.warning("Input data contains NA, filled NA with 0") exprs.dropna(how="all", inplace=True) # drop rows with all NAs @@ -113,6 +106,12 @@ def load_data(self, cls_vec: List[str]) -> Tuple[pd.DataFrame, Dict]: # select numberic columns df = exprs.select_dtypes(include=[np.number]) + if exprs.index.duplicated().sum() > 0: + self._logger.warning( + "Found duplicated gene names, values averaged by gene names!" + ) + exprs = exprs.groupby(level=0).mean() + # in case the description column is numeric if len(cls_vec) == (df.shape[1] - 1): df = df.iloc[:, 1:] @@ -581,16 +580,16 @@ def load_data(self) -> pd.DataFrame: rank_metric = rank_metric.select_dtypes(include=[np.number]) else: raise Exception("Error parsing gene ranking values!") - if rank_metric.index.duplicated().sum() > 0: - self._logger.warning( - "Dropping duplicated gene names, values averaged by gene names!" - ) - rank_metric = rank_metric.loc[rank_metric.index.dropna()] - rank_metric = rank_metric.groupby(level=0).mean() + if rank_metric.isnull().any().sum() > 0: self._logger.warning("Input data contains NA, filled NA with 0") rank_metric = rank_metric.fillna(0) + if rank_metric.index.duplicated().sum() > 0: + self._logger.warning( + "Found duplicated gene names, values averaged by gene names!" + ) + rank_metric = rank_metric.groupby(level=0).mean() return rank_metric def norm_samples(self, dat: pd.DataFrame) -> pd.DataFrame: @@ -679,7 +678,6 @@ def runSamplesPermu( return - class Replot(GSEAbase): """To reproduce GSEA desktop output results.""" diff --git a/gseapy/gsva.py b/gseapy/gsva.py index 09885cd..cbdbdfc 100644 --- a/gseapy/gsva.py +++ b/gseapy/gsva.py @@ -47,10 +47,10 @@ def __init__( self.ranking = None self.permutation_num = 0 self._noplot = True - if kcdf == "Gaussian": + if kcdf in ["Gaussian", "gaussian"]: self.kernel = True self.rnaseq = False - elif kcdf == "Poisson": + elif kcdf in ["Poisson", "poisson"]: self.kernel = True self.rnaseq = True else: @@ -106,15 +106,16 @@ def load_data(self) -> pd.DataFrame: rank_metric = rank_metric.select_dtypes(include=[np.number]) else: raise Exception("Error parsing gene ranking values!") + + if rank_metric.isnull().any().sum() > 0: + self._logger.warning("Input data contains NA, filled NA with 0") + rank_metric = rank_metric.fillna(0) + if rank_metric.index.duplicated().sum() > 0: self._logger.warning( - "Dropping duplicated gene names, values averaged by gene names!" + "Found duplicated gene names, values averaged by gene names!" ) - rank_metric = rank_metric.loc[rank_metric.index.dropna()] rank_metric = rank_metric.groupby(level=0).mean() - if rank_metric.isnull().any().sum() > 0: - self._logger.warning("Input data contains NA, filled NA with 0") - rank_metric = rank_metric.fillna(0) return rank_metric @@ -125,13 +126,7 @@ def run(self): # load data df = self.load_data() if self.rnaseq: - self._logger.debug( - "Poisson kernel selected. round input values to intergers!" - ) - df = df.astype(int) - self._logger.debug( - "Poisson kernel selected. convert negative values to 0 !" - ) + self._logger.info("Poisson kernel selected. Clip negative values to 0 !") df = df.clip(lower=0) self.data = df