move peptide number threshold out of the data_preprocessing to get the accurate number of peptides used in the analysis
byemaxx committed Oct 10, 2024
1 parent 349e3a6 commit 9060e6d
Showing 5 changed files with 92 additions and 25 deletions.
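In short, the commit moves the minimum-peptide filter out of `data_preprocess` and applies it explicitly right after outlier handling, so that `self.peptide_num_used` is recorded from the table that remains after filtering. A minimal sketch of the idea, using made-up data and simplified names rather than the project's actual API:

```python
import pandas as pd

# Hypothetical peptide-level table: one row per peptide, annotated with its taxon.
df = pd.DataFrame({
    "peptide": ["p1", "p2", "p3", "p4", "p5"],
    "Taxon":   ["A", "A", "A", "B", "B"],
    "S1":      [10.0, 5.0, 3.0, 7.0, 2.0],
})
threshold = 3  # minimum peptides required per taxon

# Previously (simplified): the count was taken before thresholding, so it overcounted.
peptide_num_used_before = len(df)                      # 5

# Now (simplified): drop taxa below the threshold first, then record the count.
counts = df.groupby("Taxon").size()                    # A: 3, B: 2
kept_taxa = counts[counts >= threshold].index          # only taxon A survives
df = df[df["Taxon"].isin(kept_taxa)]
peptide_num_used_after = len(df)                       # 3 peptides actually analysed

print(peptide_num_used_before, peptide_num_used_after)  # 5 3
```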
5 changes: 5 additions & 0 deletions Docs/ChangeLog.md
@@ -1,3 +1,8 @@
# Version: 1.115.4
## Date: 2024-10-07
### Changes:
- TODO: use the peptide number for 'self.peptide_num_used' after filtering by the minimum peptide number

# Version: 1.115.3
## Date: 2024-10-04
### Changes:
3 changes: 1 addition & 2 deletions metax/gui/main_gui.py
@@ -1857,7 +1857,6 @@ def run_after_set_multi_tables(self):
msg = f"""<html>
<body>
<p>Custom data is ready!</p>
<p>{nan_stats_str}</p>
<p>Number of items: [{num_item}]</p>
</body>
</html>
@@ -1890,7 +1889,7 @@ def run_after_set_multi_tables(self):
<table>
<tr>
<th>Category</th>
<th>Number</th>
<th>Number (After Filtering)</th>
<th>Used Peptides</th>
<th>% of All Peptides</th>
</tr>
92 changes: 82 additions & 10 deletions metax/taxafunc_analyzer/analyzer.py
@@ -689,8 +689,7 @@ def run_lfq_for_taxa_func(self, df_taxa_func):

return df_taxa_func

def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_threshold):
data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold
def update_data_preprocess_parameters(self, data_preprocess_params):

normalize_method = data_preprocess_params['normalize_method']
transform_method = data_preprocess_params['transform_method']
@@ -706,7 +705,67 @@ def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_


return data_preprocess_params

def filter_peptides_num_for_splited_func(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False):
'''
Only for the split func table or the taxa_func table
- df: the split func or taxa_func table, already grouped; the index is the func or taxa_func
- peptide_num_threshold: the peptide number threshold for each func or taxa_func
- df_type: 'func' or 'taxa_func'
- distinct_threshold_mode: TODO
'''

valid_df_types = ['func', 'taxa_func']
if df_type not in valid_df_types:
raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]")

peptide_num = peptide_num_threshold[df_type]
df_original_len = len(df)

df = df[df['peptide_num'] >= peptide_num]
print(f"Removed [{df_original_len - len(df)} {df_type}] with less than [{peptide_num}] peptides.")
return df




def filter_peptides_num(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False):
'''
Filter the peptides based on the peptide number threshold
- df: the original df including peptides, taxa, and functions, etc.
- peptide_num_threshold: the peptide number threshold for each taxa or func
- df_type: 'taxa', 'func', or 'taxa_func'
- distinct_threshold_mode: TODO
'''
valid_df_types = ['taxa', 'func', 'taxa_func']
if df_type not in valid_df_types:
raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]")

peptide_num = peptide_num_threshold[df_type]
df_original_len = len(df)

if df_type == 'taxa_func':
item_col = 'taxa_func'
df['taxa_func'] = df['Taxon'] + '&&&&' + df[self.func_name]
else:
item_col = 'Taxon' if df_type == 'taxa' else self.func_name

# Group by item_col and filter based on peptide number
dict_item_pep_num = df.groupby(item_col).size().to_dict()
remove_list = [k for k, v in dict_item_pep_num.items() if v < peptide_num]

# Remove rows based on peptide number threshold
df = df[~df[item_col].isin(remove_list)]

if df_type == 'taxa_func':
df = df.drop('taxa_func', axis=1)

self.peptide_num_used[df_type] = len(df)
print(f"Removed [{len(remove_list)} {df_type}] from [{df_original_len - len(df)} Peptides] with less than [{peptide_num}] peptides.")

return df



def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
outlier_params: dict = {'detect_method': None, 'handle_method': None,
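The new `filter_peptides_num` above counts peptides per item with a group-by and drops items below the threshold; for the taxa-function table it first builds a temporary `'&&&&'`-joined key. A rough standalone illustration of that logic on invented data (column names such as `Sequence` and `KEGG_ko` are assumptions for the sketch):

```python
import pandas as pd

df = pd.DataFrame({
    "Sequence": ["p1", "p2", "p3", "p4"],
    "Taxon":    ["A", "A", "B", "B"],
    "KEGG_ko":  ["K1", "K1", "K2", "K3"],
})
peptide_num_threshold = {"taxa": 2, "func": 2, "taxa_func": 2}

# Temporary composite key, as in the taxa_func branch of filter_peptides_num.
df["taxa_func"] = df["Taxon"] + "&&&&" + df["KEGG_ko"]

# Count peptides per item and collect the items below the threshold.
dict_item_pep_num = df.groupby("taxa_func").size().to_dict()
remove_list = [k for k, v in dict_item_pep_num.items() if v < peptide_num_threshold["taxa_func"]]

# Keep only peptides whose item meets the threshold, then drop the helper column.
df_kept = df[~df["taxa_func"].isin(remove_list)].drop(columns="taxa_func")
peptide_num_used = len(df_kept)   # 2 -- only the peptides of A&&&&K1 remain
print(peptide_num_used)
```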
@@ -751,7 +810,7 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,

#! following code is for the normal mode
# Update 'data_preprocess_params'
data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params, peptide_num_threshold)
data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params)

#2. sum the protein intensity
if sum_protein:
@@ -821,10 +880,15 @@ def strip_taxa(x, level):

if not taxa_and_func_only_from_otf:
# extract 'taxa', sample intensity #! and 'peptide_col' to avoid duplicated items when handling outliers
df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list]
df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list] # type: ignore
print("\n-----Starting to perform outlier detection and handling for [Peptide-Taxon] table...-----")
df_taxa_pep = self.detect_and_handle_outliers(df=df_taxa_pep, **outlier_params)
self.peptide_num_used['taxa'] = len(df_taxa_pep)
# TODO: use the peptide number after filtering by the minimum peptide number
# count the peptide number of each taxon
df_taxa_pep = self.filter_peptides_num(df=df_taxa_pep, peptide_num_threshold=peptide_num_threshold, df_type='taxa')


# self.peptide_num_used['taxa'] = len(df_taxa_pep)
# add column 'peptide_num' to df_taxa as 1
df_taxa_pep['peptide_num'] = 1
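In the taxa branch above, each surviving peptide is then tagged with `peptide_num = 1`, so the later group-by sum yields both the summed intensity and the per-taxon peptide count in one step. A compressed, hedged sketch of that trick (sample names invented):

```python
import pandas as pd

# Peptides that survived outlier handling and the minimum-peptide filter.
df_taxa_pep = pd.DataFrame({
    "Sequence": ["p1", "p2", "p3"],
    "Taxon":    ["A", "A", "B"],
    "S1":       [4.0, 6.0, 5.0],
})

# Each peptide counts once; summing the column per taxon gives the peptide count.
df_taxa_pep["peptide_num"] = 1
df_taxa = df_taxa_pep.groupby("Taxon").sum(numeric_only=True)
print(df_taxa)
#          S1  peptide_num
# Taxon
# A      10.0            2
# B       5.0            1
```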

@@ -844,7 +908,8 @@ def strip_taxa(x, level):
df_func_pep = df_func_pep[[self.peptide_col_name, self.func_name] + self.sample_list]
print("\n-----Starting to perform outlier detection and handling for [Peptide-Function] table...-----")
df_func_pep = self.detect_and_handle_outliers(df=df_func_pep, **outlier_params)
self.peptide_num_used['func'] = len(df_func_pep)
if not split_func:
df_func_pep = self.filter_peptides_num(df=df_func_pep, peptide_num_threshold=peptide_num_threshold, df_type='func')
df_func_pep['peptide_num'] = 1

if quant_method == 'lfq':
@@ -853,8 +918,10 @@ def strip_taxa(x, level):
df_func = df_func_pep.groupby(self.func_name).sum(numeric_only=True)

if split_func:
self.peptide_num_used['func'] = len(df_func_pep)
df_func = self.split_func(df=df_func, split_func_params=split_func_params, df_type='func')

df_func = self.filter_peptides_num_for_splited_func(df=df_func, peptide_num_threshold=peptide_num_threshold, df_type='func')

df_func = self.data_preprocess(df=df_func,df_name = 'func', **data_preprocess_params)
self.func_df = df_func
#-----Func Table End-----
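When `split_func` is enabled, filtering is deferred until after the function names have been split, and is then applied to the already-grouped table via `filter_peptides_num_for_splited_func`, which simply thresholds on the aggregated `peptide_num` column. A small sketch of that post-grouping filter (the data are invented):

```python
import pandas as pd

# A grouped function table after split_func: the index is the function,
# and 'peptide_num' holds the summed per-function peptide count.
df_func = pd.DataFrame(
    {"S1": [12.0, 3.0, 9.0], "peptide_num": [4, 1, 2]},
    index=["K00001", "K00002", "K00003"],
)
peptide_num_threshold = {"func": 2, "taxa_func": 2}

# Same idea as filter_peptides_num_for_splited_func: keep rows whose
# aggregated peptide_num meets the threshold for this table type.
df_func = df_func[df_func["peptide_num"] >= peptide_num_threshold["func"]]
print(df_func)   # K00001 and K00003 remain
```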
@@ -887,6 +954,8 @@ def strip_taxa(x, level):
# ----- create taxa_func table -----
df_taxa_func = df_half_processed_peptides[[self.peptide_col_name, 'Taxon', self.func_name] + self.sample_list]
df_taxa_func['peptide_num'] = 1
if not split_func:
df_taxa_func = self.filter_peptides_num(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold, df_type='taxa_func')

for key in ['taxa_func', 'taxa', 'func']:
self.peptide_num_used[key] = len(df_taxa_func) if self.peptide_num_used[key] == 0 else self.peptide_num_used[key]
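The loop above back-fills `self.peptide_num_used` for any table type whose count was never set, falling back to the size of the peptide-taxa-function table. Roughly, under that assumption:

```python
# Hypothetical illustration of the fallback: counts left at 0 inherit the
# peptide count of the taxa-function table.
peptide_num_used = {"taxa": 0, "func": 120, "taxa_func": 0}
len_df_taxa_func = 150  # peptides in the taxa-function table

for key in ("taxa_func", "taxa", "func"):
    if peptide_num_used[key] == 0:
        peptide_num_used[key] = len_df_taxa_func

print(peptide_num_used)   # {'taxa': 150, 'func': 120, 'taxa_func': 150}
```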
@@ -899,6 +968,9 @@ def strip_taxa(x, level):
# split the function before data preprocess
if split_func:
df_taxa_func = self.split_func( df=df_taxa_func, split_func_params=split_func_params, df_type='taxa_func')
df_taxa_func = self.filter_peptides_num_for_splited_func(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold,
df_type='taxa_func')


print("\n-----Starting to perform data pre-processing for [Taxa-Function] table...-----")
df_taxa_func_all_processed = self.data_preprocess(df=df_taxa_func
@@ -1025,14 +1097,14 @@ def get_df(self, table_name:str = 'taxa'):
sw.set_func('KEGG_Pathway_name')
sw.set_group('Individual')
sw.set_multi_tables(level='s',
outlier_params = {'detect_method': 'zero-dominant', 'handle_method': 'original',
outlier_params = {'detect_method': 'None', 'handle_method': 'original',
"detection_by_group" : 'Individual', "handle_by_group": None},
data_preprocess_params = {
'normalize_method': 'trace_shift',
'normalize_method': 'None',
'transform_method': "log2",
'batch_meta': 'None',
'processing_order': ['transform', 'normalize', 'batch']},
peptide_num_threshold = {'taxa': 2, 'func': 2, 'taxa_func': 2},
peptide_num_threshold = {'taxa': 3, 'func': 3, 'taxa_func': 3},
keep_unknow_func=False, sum_protein=False,
sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3},
split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False},
15 changes: 3 additions & 12 deletions metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
@@ -614,7 +614,7 @@ def detect_and_handle_outliers(self, df: pd.DataFrame,
def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
transform_method: str|None = None, batch_meta: str|None =None,
processing_order:list|None =None,
df_name:str = "None", peptide_num_threshold:dict[str, int] ={'taxa': 1, 'func': 1, 'taxa_func': 1}
df_name:str = "None"
) -> pd.DataFrame:
"""
## `data_preprocess` Method
@@ -662,11 +662,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
- `taxa_func`
- `protein`
- `custom`
- `peptide_num_threshold` (`dict`, optional):
The threshold for the number of peptides in each DataFrame. Default values are:
- `taxa`: 3
- `func`: 3
- `taxa_func`: 3
### Returns:
@@ -677,12 +673,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,

df = df.copy()

# remove items with peptide number less than threshold
if df_name in ['taxa', 'func', 'taxa_func']:
print(f'{df_name.upper()} number before removing: {df.shape[0]}')
df = df[df['peptide_num'] >= peptide_num_threshold[df_name]]
print(f'{df_name.upper()} number with peptide_num >= [{peptide_num_threshold[df_name]}]: {df.shape[0]}')


if processing_order is None:
processing_order = ['transform', 'normalize', 'batch']
else:
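With the threshold argument removed, `data_preprocess` no longer shrinks the table itself; callers are expected to apply the peptide-number filter first and hand over an already-thresholded DataFrame. A hedged sketch of the new division of responsibility (simplified stand-in, not the real signature):

```python
import numpy as np
import pandas as pd

def data_preprocess(df: pd.DataFrame, transform_method: str | None = None) -> pd.DataFrame:
    # Simplified stand-in: no peptide_num filtering happens in here any more.
    out = df.copy()
    if transform_method == "log2":
        sample_cols = [c for c in out.columns if c != "peptide_num"]
        out[sample_cols] = np.log2(out[sample_cols] + 1)
    return out

# Caller-side responsibility: filter by peptide number *before* preprocessing.
df_taxa = pd.DataFrame({"peptide_num": [3, 1], "S1": [8.0, 2.0]}, index=["A", "B"])
df_taxa = df_taxa[df_taxa["peptide_num"] >= 2]        # done by the caller now
df_taxa = data_preprocess(df_taxa, transform_method="log2")
print(df_taxa)   # A: peptide_num 3, S1 = log2(9) ≈ 3.17
```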
2 changes: 1 addition & 1 deletion metax/utils/version.py
@@ -1,2 +1,2 @@
__version__ = '1.115.3'
__version__ = '1.115.4'
API_version = '3'
