From 9060e6d8204c3c712bd46a57bc5ff9d259548012 Mon Sep 17 00:00:00 2001
From: Qing <44231502+byemaxx@users.noreply.github.com>
Date: Thu, 10 Oct 2024 15:18:46 -0400
Subject: [PATCH] move peptide number threshold out of the data_preprocessing
to get the accurate number of peptides used in the analysis
---
Docs/ChangeLog.md | 5 +
metax/gui/main_gui.py | 3 +-
metax/taxafunc_analyzer/analyzer.py | 92 +++++++++++++++++--
.../analyzer_utils/data_preprocessing.py | 15 +--
metax/utils/version.py | 2 +-
5 files changed, 92 insertions(+), 25 deletions(-)
diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md
index 05e1ae3..3c07c5b 100644
--- a/Docs/ChangeLog.md
+++ b/Docs/ChangeLog.md
@@ -1,3 +1,8 @@
+# Version: 1.115.4
+## Date: 2024-10-07
+### Changes:
+- TODO: set 'self.peptide_num_used' to the number of peptides remaining after the minimum peptide number filter
+
# Version: 1.115.3
## Date: 2024-10-04
### Changes:
diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py
index 10cbe18..73d4479 100644
--- a/metax/gui/main_gui.py
+++ b/metax/gui/main_gui.py
@@ -1857,7 +1857,6 @@ def run_after_set_multi_tables(self):
msg = f"""
Custom data is ready!
- {nan_stats_str}
Number of items: [{num_item}]
@@ -1890,7 +1889,7 @@ def run_after_set_multi_tables(self):
Category |
- Number |
+ Number (After Filtering) |
Used Peptides |
% of All Peptides |
diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py
index 24e37ff..b21e774 100644
--- a/metax/taxafunc_analyzer/analyzer.py
+++ b/metax/taxafunc_analyzer/analyzer.py
@@ -689,8 +689,7 @@ def run_lfq_for_taxa_func(self, df_taxa_func):
return df_taxa_func
- def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_threshold):
- data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold
+ def update_data_preprocess_parameters(self, data_preprocess_params):
normalize_method = data_preprocess_params['normalize_method']
transform_method = data_preprocess_params['transform_method']
@@ -706,7 +705,67 @@ def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_
return data_preprocess_params
+
+ def filter_peptides_num_for_splited_func(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False):
+ '''
+ Only for the split func table or taxa_func table
+ - df: the split func table or taxa_func table, already grouped; the index is the func or taxa_func and it must contain a 'peptide_num' column
+ - peptide_num_threshold: dict giving the minimum peptide number for each df_type; rows with fewer peptides are removed
+ - df_type: 'func' or 'taxa_func'
+ - distinct_threshold_mode: not implemented yet (currently unused)
+ '''
+
+ valid_df_types = ['func', 'taxa_func']
+ if df_type not in valid_df_types:
+ raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]")
+
+ peptide_num= peptide_num_threshold[df_type]
+ df_original_len = len(df)
+
+ df = df[df['peptide_num'] >= peptide_num]
+ print(f"Removed [{df_original_len - len(df)} {df_type}] with less than [{peptide_num}] peptides.")
+ return df
+
+
+
+ def filter_peptides_num(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False):
+ '''
+ Remove the peptides of every item (taxa, func, or taxa_func) that has fewer peptides than the threshold
+ - df: the original df including peptides, taxa, and functions, etc.; each row is counted as one peptide
+ - peptide_num_threshold: dict giving the minimum peptide number for each df_type
+ - df_type: 'taxa', 'func', or 'taxa_func'
+ - distinct_threshold_mode: not implemented yet (currently unused)
+ '''
+ valid_df_types = ['taxa', 'func', 'taxa_func']
+ if df_type not in valid_df_types:
+ raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]")
+
+ peptide_num= peptide_num_threshold[df_type]
+ df_original_len = len(df)
+
+ if df_type == 'taxa_func':
+ item_col = 'taxa_func'
+ df['taxa_func'] = df['Taxon'] + '&&&&' + df[self.func_name]
+ else:
+ item_col = 'Taxon' if df_type == 'taxa' else self.func_name
+
+ # Group by item_col and filter based on peptide number
+ dict_item_pep_num = df.groupby(item_col).size().to_dict()
+ remove_list = [k for k, v in dict_item_pep_num.items() if v < peptide_num]
+
+ # Remove rows based on peptide number threshold
+ df = df[~df[item_col].isin(remove_list)]
+
+ if df_type == 'taxa_func':
+ df = df.drop('taxa_func', axis=1)
+
+ self.peptide_num_used[df_type] = len(df)
+ print(f"Removed [{len(remove_list)} {df_type}] from [{df_original_len - len(df)} Peptides] with less than [{peptide_num}] peptides.")
+
+ return df
+
+
def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
outlier_params: dict = {'detect_method': None, 'handle_method': None,
@@ -751,7 +810,7 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
#! fllowing code is for the normal mode
# Update 'data_preprocess_params'
- data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params, peptide_num_threshold)
+ data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params)
#2. sum the protein intensity
if sum_protein:
@@ -821,10 +880,15 @@ def strip_taxa(x, level):
if not taxa_and_func_only_from_otf:
# extract 'taxa', sample intensity #! and 'peptide_col' to avoid the duplicated items when handling outlier
- df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list]
+ df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list] # type: ignore
print("\n-----Starting to perform outlier detection and handling for [Peptide-Taxon] table...-----")
df_taxa_pep = self.detect_and_handle_outliers(df=df_taxa_pep, **outlier_params)
- self.peptide_num_used['taxa'] = len(df_taxa_pep)
+ # Remove the peptides of taxa that have fewer peptides than the threshold;
+ # filter_peptides_num also stores the remaining peptide number in self.peptide_num_used['taxa']
+ df_taxa_pep = self.filter_peptides_num(df=df_taxa_pep, peptide_num_threshold=peptide_num_threshold, df_type='taxa')
+
+
+ # self.peptide_num_used['taxa'] = len(df_taxa_pep)
# add column 'peptide_num' to df_taxa as 1
df_taxa_pep['peptide_num'] = 1
@@ -844,7 +908,8 @@ def strip_taxa(x, level):
df_func_pep = df_func_pep[[self.peptide_col_name, self.func_name] + self.sample_list]
print("\n-----Starting to perform outlier detection and handling for [Peptide-Function] table...-----")
df_func_pep = self.detect_and_handle_outliers(df=df_func_pep, **outlier_params)
- self.peptide_num_used['func'] = len(df_func_pep)
+ if not split_func:
+ df_func_pep = self.filter_peptides_num(df=df_func_pep, peptide_num_threshold=peptide_num_threshold, df_type='func')
df_func_pep['peptide_num'] = 1
if quant_method == 'lfq':
@@ -853,8 +918,10 @@ def strip_taxa(x, level):
df_func = df_func_pep.groupby(self.func_name).sum(numeric_only=True)
if split_func:
+ self.peptide_num_used['func'] = len(df_func_pep)
df_func = self.split_func(df=df_func, split_func_params=split_func_params, df_type='func')
-
+ df_func = self.filter_peptides_num_for_splited_func(df=df_func, peptide_num_threshold=peptide_num_threshold, df_type='func')
+
df_func = self.data_preprocess(df=df_func,df_name = 'func', **data_preprocess_params)
self.func_df = df_func
#-----Func Table End-----
@@ -887,6 +954,8 @@ def strip_taxa(x, level):
# ----- create taxa_func table -----
df_taxa_func = df_half_processed_peptides[[self.peptide_col_name, 'Taxon', self.func_name] + self.sample_list]
df_taxa_func['peptide_num'] = 1
+ if not split_func:
+ df_taxa_func = self.filter_peptides_num(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold, df_type='taxa_func')
for key in ['taxa_func', 'taxa', 'func']:
self.peptide_num_used[key] = len(df_taxa_func) if self.peptide_num_used[key] == 0 else self.peptide_num_used[key]
@@ -899,6 +968,9 @@ def strip_taxa(x, level):
# split the function before data preprocess
if split_func:
df_taxa_func = self.split_func( df=df_taxa_func, split_func_params=split_func_params, df_type='taxa_func')
+ df_taxa_func = self.filter_peptides_num_for_splited_func(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold,
+ df_type='taxa_func')
+
print("\n-----Starting to perform data pre-processing for [Taxa-Function] table...-----")
df_taxa_func_all_processed = self.data_preprocess(df=df_taxa_func
@@ -1025,14 +1097,14 @@ def get_df(self, table_name:str = 'taxa'):
sw.set_func('KEGG_Pathway_name')
sw.set_group('Individual')
sw.set_multi_tables(level='s',
- outlier_params = {'detect_method': 'zero-dominant', 'handle_method': 'original',
+ outlier_params = {'detect_method': 'None', 'handle_method': 'original',
"detection_by_group" : 'Individual', "handle_by_group": None},
data_preprocess_params = {
- 'normalize_method': 'trace_shift',
+ 'normalize_method': 'None',
'transform_method': "log2",
'batch_meta': 'None',
'processing_order': ['transform', 'normalize', 'batch']},
- peptide_num_threshold = {'taxa': 2, 'func': 2, 'taxa_func': 2},
+ peptide_num_threshold = {'taxa': 3, 'func': 3, 'taxa_func': 3},
keep_unknow_func=False, sum_protein=False,
sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3},
split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False},
diff --git a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
index fef5f33..a7ab723 100644
--- a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
+++ b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
@@ -614,7 +614,7 @@ def detect_and_handle_outliers(self, df: pd.DataFrame,
def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
transform_method: str|None = None, batch_meta: str|None =None,
processing_order:list|None =None,
- df_name:str = "None", peptide_num_threshold:dict[str, int] ={'taxa': 1, 'func': 1, 'taxa_func': 1}
+ df_name:str = "None"
) -> pd.DataFrame:
"""
## `data_preprocess` Method
@@ -662,11 +662,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
- `taxa_func`
- `protein`
- `custom`
- - `peptide_num_threshold` (`dict`, optional):
- The threshold for the number of peptides in each DataFrame. Default values are:
- - `taxa`: 3
- - `func`: 3
- - `taxa_func`: 3
+
### Returns:
@@ -677,12 +673,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
df = df.copy()
- # remove items with peptide number less than threshold
- if df_name in ['taxa', 'func', 'taxa_func']:
- print(f'{df_name.upper()} number before removing: {df.shape[0]}')
- df = df[df['peptide_num'] >= peptide_num_threshold[df_name]]
- print(f'{df_name.upper()} number with peptide_num >= [{peptide_num_threshold[df_name]}]: {df.shape[0]}')
-
+
if processing_order is None:
processing_order = ['transform', 'normalize', 'batch']
else:
diff --git a/metax/utils/version.py b/metax/utils/version.py
index 06dc858..b0d8e7d 100644
--- a/metax/utils/version.py
+++ b/metax/utils/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.115.3'
+__version__ = '1.115.4'
API_version = '3'
\ No newline at end of file