From 9060e6d8204c3c712bd46a57bc5ff9d259548012 Mon Sep 17 00:00:00 2001
From: Qing <44231502+byemaxx@users.noreply.github.com>
Date: Thu, 10 Oct 2024 15:18:46 -0400
Subject: [PATCH] move peptide number threshold out of the data_preprocessing
 to get the accurate number of peptides used in the analysis

---
 Docs/ChangeLog.md                             |  5 +
 metax/gui/main_gui.py                         |  3 +-
 metax/taxafunc_analyzer/analyzer.py           | 92 +++++++++++++++++--
 .../analyzer_utils/data_preprocessing.py      | 15 +--
 metax/utils/version.py                        |  2 +-
 5 files changed, 92 insertions(+), 25 deletions(-)
diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md
index 05e1ae3..3c07c5b 100644
--- a/Docs/ChangeLog.md
+++ b/Docs/ChangeLog.md
@@ -1,3 +1,8 @@
+# Version: 1.115.4
+## Date: 2024-10-07
+### Changes:
+- TODO: set 'self.peptide_num_used' to the number of peptides remaining after filtering by the minimum peptide number
+
 # Version: 1.115.3
 ## Date: 2024-10-04
 ### Changes:
diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py
index 10cbe18..73d4479 100644
--- a/metax/gui/main_gui.py
+++ b/metax/gui/main_gui.py
@@ -1857,7 +1857,6 @@ def run_after_set_multi_tables(self):
             msg = f"""<html>
             <body>
             <p>Custom data is ready!</p>
-            <p>{nan_stats_str}</p>
             <p>Number of items: [{num_item}]</p>
             </body>
             </html>
@@ -1890,7 +1889,7 @@ def run_after_set_multi_tables(self):
                 <table>
                     <tr>
                         <th>Category</th>
-                        <th>Number</th>
+                        <th>Number (After Filtering)</th>
                         <th>Used Peptides</th>
                         <th>% of All Peptides</th>
                     </tr>
diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py
index 24e37ff..b21e774 100644
--- a/metax/taxafunc_analyzer/analyzer.py
+++ b/metax/taxafunc_analyzer/analyzer.py
@@ -689,8 +689,7 @@ def run_lfq_for_taxa_func(self, df_taxa_func):
         
         return df_taxa_func
     
-    def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_threshold):
-        data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold
+    def update_data_preprocess_parameters(self, data_preprocess_params):
         
         normalize_method = data_preprocess_params['normalize_method']
         transform_method = data_preprocess_params['transform_method']
@@ -706,7 +705,67 @@ def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_
                 
         
         return data_preprocess_params
+
+    def filter_peptides_num_for_splited_func(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False):
+        '''
+        Only for the split func table or taxa_func table
+        - df: the split func or taxa_func table, already grouped (index is the func or taxa_func), with a 'peptide_num' column
+        - peptide_num_threshold: dict of the minimum peptide number, keyed by df_type
+        - df_type: 'func' or 'taxa_func'
+        - distinct_threshold_mode: TODO (currently unused)
+        '''
+        
+        valid_df_types = ['func', 'taxa_func']
+        if df_type not in valid_df_types:
+            raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]")
+        
+        peptide_num= peptide_num_threshold[df_type]
+        df_original_len = len(df)
+        
+        df = df[df['peptide_num'] >= peptide_num]
+        print(f"Removed [{df_original_len - len(df)} {df_type}] with less than [{peptide_num}] peptides.")
+        return df    
+    
+
+        
         
+    def filter_peptides_num(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False):
+        '''
+        Filter the peptides based on the peptide number threshold and store the remaining row count in self.peptide_num_used[df_type]
+        - df: the original df including peptides, taxa, and functions, etc.
+        - peptide_num_threshold: dict of the minimum peptide number for each taxa or func, keyed by df_type
+        - df_type: 'taxa', 'func', or 'taxa_func'
+        - distinct_threshold_mode: TODO (currently unused)
+        '''
+        valid_df_types = ['taxa', 'func', 'taxa_func']
+        if df_type not in valid_df_types:
+            raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]")
+        
+        peptide_num= peptide_num_threshold[df_type]
+        df_original_len = len(df)
+
+        if df_type == 'taxa_func':
+            item_col = 'taxa_func'
+            df['taxa_func'] = df['Taxon'] + '&&&&' + df[self.func_name]
+        else:
+            item_col = 'Taxon' if df_type == 'taxa' else self.func_name
+
+        # Group by item_col and filter based on peptide number
+        dict_item_pep_num = df.groupby(item_col).size().to_dict()
+        remove_list = [k for k, v in dict_item_pep_num.items() if v < peptide_num]
+
+        # Remove rows based on peptide number threshold
+        df = df[~df[item_col].isin(remove_list)]
+
+        if df_type == 'taxa_func':
+            df = df.drop('taxa_func', axis=1)
+
+        self.peptide_num_used[df_type] = len(df)
+        print(f"Removed [{len(remove_list)} {df_type}] from [{df_original_len - len(df)} Peptides] with less than [{peptide_num}] peptides.")
+
+        return df
+
+
             
     def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
                          outlier_params: dict = {'detect_method': None, 'handle_method': None,
@@ -751,7 +810,7 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
 
         #! fllowing code is for the normal mode
         # Update 'data_preprocess_params'
-        data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params, peptide_num_threshold)
+        data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params)
         
         #2. sum the protein intensity
         if sum_protein:
@@ -821,10 +880,15 @@ def strip_taxa(x, level):
         
         if not taxa_and_func_only_from_otf:
             # extract 'taxa', sample intensity #! and 'peptide_col' to avoid the duplicated items when handling outlier
-            df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list]
+            df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list] # type: ignore
             print("\n-----Starting to perform outlier detection and handling for [Peptide-Taxon] table...-----")
             df_taxa_pep = self.detect_and_handle_outliers(df=df_taxa_pep, **outlier_params)
-            self.peptide_num_used['taxa'] = len(df_taxa_pep)
+            #TODO: use the peptide number after filtering by the minimum peptide number
+            # count the peptides of each taxon and remove the taxa below the threshold
+            df_taxa_pep = self.filter_peptides_num(df=df_taxa_pep, peptide_num_threshold=peptide_num_threshold, df_type='taxa')
+            
+            
+            # self.peptide_num_used['taxa'] = len(df_taxa_pep)
             # add column 'peptide_num' to df_taxa as 1
             df_taxa_pep['peptide_num'] = 1
             
@@ -844,7 +908,8 @@ def strip_taxa(x, level):
             df_func_pep = df_func_pep[[self.peptide_col_name, self.func_name] + self.sample_list]
             print("\n-----Starting to perform outlier detection and handling for [Peptide-Function] table...-----")
             df_func_pep = self.detect_and_handle_outliers(df=df_func_pep, **outlier_params)
-            self.peptide_num_used['func'] = len(df_func_pep)
+            if not split_func:
+                df_func_pep = self.filter_peptides_num(df=df_func_pep, peptide_num_threshold=peptide_num_threshold, df_type='func')
             df_func_pep['peptide_num'] = 1
             
             if quant_method == 'lfq':
@@ -853,8 +918,10 @@ def strip_taxa(x, level):
                 df_func = df_func_pep.groupby(self.func_name).sum(numeric_only=True)
             
             if split_func:
+                self.peptide_num_used['func'] = len(df_func_pep)
                 df_func = self.split_func(df=df_func, split_func_params=split_func_params, df_type='func')
-                
+                df_func = self.filter_peptides_num_for_splited_func(df=df_func, peptide_num_threshold=peptide_num_threshold, df_type='func')
+
             df_func = self.data_preprocess(df=df_func,df_name = 'func', **data_preprocess_params)
             self.func_df = df_func
             #-----Func Table End-----
@@ -887,6 +954,8 @@ def strip_taxa(x, level):
         # ----- create taxa_func table -----
         df_taxa_func = df_half_processed_peptides[[self.peptide_col_name, 'Taxon', self.func_name] + self.sample_list]
         df_taxa_func['peptide_num'] = 1
+        if not split_func:
+            df_taxa_func = self.filter_peptides_num(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold, df_type='taxa_func')
         
         for key in ['taxa_func', 'taxa', 'func']:
             self.peptide_num_used[key] = len(df_taxa_func) if self.peptide_num_used[key] == 0 else self.peptide_num_used[key]
@@ -899,6 +968,9 @@ def strip_taxa(x, level):
         # split the function before data preprocess
         if split_func:
             df_taxa_func = self.split_func( df=df_taxa_func, split_func_params=split_func_params, df_type='taxa_func')
+            df_taxa_func = self.filter_peptides_num_for_splited_func(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold, 
+                                                                     df_type='taxa_func')
+            
             
         print("\n-----Starting to perform data pre-processing for [Taxa-Function] table...-----")
         df_taxa_func_all_processed = self.data_preprocess(df=df_taxa_func
@@ -1025,14 +1097,14 @@ def get_df(self, table_name:str = 'taxa'):
     sw.set_func('KEGG_Pathway_name')
     sw.set_group('Individual')
     sw.set_multi_tables(level='s', 
-                        outlier_params = {'detect_method': 'zero-dominant', 'handle_method': 'original',
+                        outlier_params = {'detect_method': 'None', 'handle_method': 'original',
                             "detection_by_group" : 'Individual', "handle_by_group": None},
                         data_preprocess_params = {
-                                                'normalize_method': 'trace_shift', 
+                                                'normalize_method': 'None', 
                                                 'transform_method': "log2",
                                                 'batch_meta': 'None', 
                                                 'processing_order': ['transform', 'normalize', 'batch']},
-                    peptide_num_threshold = {'taxa': 2, 'func': 2, 'taxa_func': 2},
+                    peptide_num_threshold = {'taxa': 3, 'func': 3, 'taxa_func': 3},
                     keep_unknow_func=False, sum_protein=False, 
                     sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3},
                     split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False},
diff --git a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
index fef5f33..a7ab723 100644
--- a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
+++ b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
@@ -614,7 +614,7 @@ def detect_and_handle_outliers(self, df: pd.DataFrame,
     def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, 
                          transform_method: str|None = None, batch_meta: str|None =None,
                          processing_order:list|None =None,
-                         df_name:str = "None", peptide_num_threshold:dict[str, int] ={'taxa': 1, 'func': 1, 'taxa_func': 1}
+                         df_name:str = "None"
                          ) -> pd.DataFrame:
         """
         ## `data_preprocess` Method
@@ -662,11 +662,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
             - `taxa_func`
             - `protein`
             - `custom`
-        - `peptide_num_threshold` (`dict`, optional):
-        The threshold for the number of peptides in each DataFrame. Default values are:
-        - `taxa`: 3
-        - `func`: 3
-        - `taxa_func`: 3
+
         
         ### Returns:
 
@@ -677,12 +673,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
         
         df = df.copy()
         
-        # remove items with peptide number less than threshold
-        if df_name in ['taxa', 'func', 'taxa_func']:
-            print(f'{df_name.upper()} number before removing: {df.shape[0]}')
-            df = df[df['peptide_num'] >= peptide_num_threshold[df_name]]
-            print(f'{df_name.upper()} number with peptide_num >= [{peptide_num_threshold[df_name]}]: {df.shape[0]}')
-           
+
         if processing_order is None:
             processing_order = ['transform', 'normalize', 'batch']
         else:
diff --git a/metax/utils/version.py b/metax/utils/version.py
index 06dc858..b0d8e7d 100644
--- a/metax/utils/version.py
+++ b/metax/utils/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.115.3'
+__version__ = '1.115.4'
 API_version = '3'
\ No newline at end of file