From c53a4c2796718e81afb270963805b63d45c455d9 Mon Sep 17 00:00:00 2001
From: Zhuoqing Fang <fangzq@KLG9H04WF3.local>
Date: Sun, 15 Dec 2024 22:05:57 -0800
Subject: [PATCH] when geneid is index, gene name and ranking values are
 columns #250

---
 gseapy/gsea.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/gseapy/gsea.py b/gseapy/gsea.py
index 80817c5..169423a 100644
--- a/gseapy/gsea.py
+++ b/gseapy/gsea.py
@@ -472,11 +472,15 @@ def _load_ranking(self, rank_metric: pd.DataFrame) -> pd.Series:
 
         """
         # load data
-        # sort ranking values from high to low
         rnk_cols = rank_metric.columns
+        # in case the input has more than two columns, only select the last two
+        # this happens when gene id is the index, while gene name and ranking value are columns
+        if len(rnk_cols) > 2:
+            rank_metric = rank_metric.iloc[:, -2:]
+            rnk_cols = rank_metric.columns
         # if not ranking.is_monotonic_decreasing:
         #     ranking = ranking.sort_values(ascending=self.ascending)
-        rank_metric.sort_values(by=rnk_cols[1], ascending=self.ascending, inplace=True)
+        rank_metric.sort_values(by=rnk_cols[-1], ascending=self.ascending, inplace=True)
         # drop na values
         if rank_metric.isnull().any(axis=1).sum() > 0:
             self._logger.warning(
@@ -518,12 +522,6 @@ def load_ranking(self):
         parse rnk input
         """
         rank_metric = self._load_data(self.rnk)  # gene id is the first column
-        # only two column dataframe is accepted
-        if rank_metric.shape[1] > 2:
-            raise ValueError(
-                "Input gene rankings should be a two column dataframe, "
-                + "with the first column as gene names and the second column as prerank values."
-            )
         if rank_metric.select_dtypes(np.number).shape[1] == 1:
             # return series
             return self._load_ranking(rank_metric)