From e1ba6c420250fbe837db793045d567c5f6257efe Mon Sep 17 00:00:00 2001 From: Qing Date: Sun, 22 Sep 2024 18:53:47 -0400 Subject: [PATCH] - New: Added a minimum peptide threshold for the Protein Summing part. - Fix: Fixed the index of the peptide table being lost after data preprocessing. --- Docs/ChangeLog.md | 6 + metax/gui/main_gui.py | 6 +- metax/gui/metax_gui/main_window.ui | 346 ++++++++++-------- metax/gui/metax_gui/ui_main_window.py | 29 +- metax/taxafunc_analyzer/analyzer.py | 12 +- .../analyzer_utils/data_preprocessing.py | 3 +- .../analyzer_utils}/razor_sum.py | 88 ++++- .../analyzer_utils/sum_protein_intensity.py | 51 ++- metax/utils/version.py | 2 +- 9 files changed, 335 insertions(+), 208 deletions(-) rename metax/{utils/scripts => taxafunc_analyzer/analyzer_utils}/razor_sum.py (71%) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index e9fe29d..942f5b3 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,3 +1,9 @@ +# Version: 1.114.2 +## Date: 2024-09-22 +### Changes: +- New: Added a minimum peptide threshold for the Protein Summing part. +- Fix: Fixed the index of the peptide table being lost after data preprocessing. 
+ # Version: 1.114.1 ## Date: 2024-09-19 diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index 77ea594..1eee15f 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -980,8 +980,10 @@ def change_event_checkBox_create_protein_table(self): # self.checkBox_infrence_protein_by_sample.setEnabled(True) # self.comboBox_protein_ranking_method.setEnabled(True) self.comboBox_method_of_protein_inference.setEnabled(True) + self.spinBox_peptide_num_threshold_protein.setEnabled(True) else: self.comboBox_method_of_protein_inference.setEnabled(False) + self.spinBox_peptide_num_threshold_protein.setEnabled(False) self.checkBox_infrence_protein_by_sample.setEnabled(False) self.comboBox_protein_ranking_method.setEnabled(False) @@ -2146,7 +2148,6 @@ def show_pushButton_preprocessing_help(self): \n\nMultiple: Outliers will be imputed by using IterativeImputer with multiple imputations method. It uses the IterativeImputer with a specified number (K=5) of nearest features.\ \n\n\nData Normalization:\ \n\nIf you use [Z-Score, Mean centering and Pareto Scaling] data normalization, the data will be given a minimum offset again to avoid negative values.\ - \n\n\nBatch Effect Correction: only apply to peptide data. 
''' msg_box.setText(help_text) msg_box.exec_() @@ -2589,7 +2590,8 @@ def set_multi_table(self, restore_taxafunc=False, saved_obj=None): 'method': self.comboBox_method_of_protein_inference.currentText(), 'by_sample': self.checkBox_infrence_protein_by_sample.isChecked(), 'rank_method' :self.comboBox_protein_ranking_method.currentText(), - 'greedy_method': self.settings.value('protein_infer_greedy_mode', 'heap') + 'greedy_method': self.settings.value('protein_infer_greedy_mode', 'heap'), + 'peptide_mun_threshold': self.spinBox_peptide_num_threshold_protein.value(), } diff --git a/metax/gui/metax_gui/main_window.ui b/metax/gui/metax_gui/main_window.ui index c9a215d..c683599 100644 --- a/metax/gui/metax_gui/main_window.ui +++ b/metax/gui/metax_gui/main_window.ui @@ -245,8 +245,8 @@ 0 0 - 391 - 80 + 528 + 530 @@ -574,6 +574,30 @@ + + + + + + Peptide Number Threshold of Protein + + + + + + + false + + + 1 + + + 999 + + + + + @@ -2722,7 +2746,7 @@ 0 0 - 621 + 999 150 @@ -3725,7 +3749,7 @@ 0 0 - 878 + 1020 128 @@ -5619,8 +5643,8 @@ 0 0 - 535 - 94 + 996 + 99 @@ -7400,8 +7424,8 @@ 0 0 - 620 - 65 + 1016 + 101 @@ -8109,8 +8133,8 @@ 0 0 - 775 - 102 + 1016 + 185 @@ -10216,7 +10240,7 @@ 0 0 1122 - 21 + 23 @@ -10349,12 +10373,12 @@ setEnabled(bool) - 663 - 114 + 100 + 81 - 836 - 116 + 109 + 81 @@ -10365,12 +10389,12 @@ setEnabled(bool) - 663 - 114 + 100 + 81 - 1042 - 116 + 117 + 82 @@ -10381,12 +10405,12 @@ setEnabled(bool) - 184 - 150 + 61 + 85 - 281 - 152 + 68 + 85 @@ -10397,12 +10421,12 @@ setEnabled(bool) - 184 - 150 + 61 + 85 - 397 - 152 + 76 + 87 @@ -10413,12 +10437,12 @@ setEnabled(bool) - 63 - 100 + 61 + 86 - 70 - 100 + 68 + 86 @@ -10429,12 +10453,12 @@ setEnabled(bool) - 63 - 100 + 61 + 86 - 77 - 101 + 75 + 87 @@ -10445,12 +10469,12 @@ setEnabled(bool) - 96 - 94 + 94 + 80 - 108 - 94 + 106 + 80 @@ -10461,12 +10485,12 @@ setEnabled(bool) - 96 - 94 + 94 + 80 - 119 - 95 + 117 + 81 @@ -10477,12 +10501,12 @@ setEnabled(bool) - 96 - 93 + 94 + 79 - 108 - 93 + 106 + 79 @@ -10493,12 
+10517,12 @@ setEnabled(bool) - 96 - 93 + 94 + 79 - 119 - 94 + 117 + 80 @@ -10509,12 +10533,12 @@ setEnabled(bool) - 212 - 151 + 61 + 86 - 309 - 138 + 68 + 86 @@ -10525,12 +10549,12 @@ setEnabled(bool) - 212 - 151 + 61 + 86 - 405 - 140 + 75 + 87 @@ -10541,12 +10565,12 @@ setEnabled(bool) - 170 - 195 + 85 + 91 - 269 - 197 + 97 + 91 @@ -10557,12 +10581,12 @@ setEnabled(bool) - 95 - 95 + 93 + 81 - 108 - 95 + 106 + 81 @@ -10573,12 +10597,12 @@ setEnabled(bool) - 95 - 95 + 93 + 81 - 119 - 96 + 117 + 82 @@ -10589,12 +10613,12 @@ setEnabled(bool) - 96 - 94 + 94 + 80 - 108 - 94 + 106 + 80 @@ -10605,12 +10629,12 @@ setEnabled(bool) - 96 - 94 + 94 + 80 - 119 - 95 + 117 + 81 @@ -10621,12 +10645,12 @@ setEnabled(bool) - 1056 - 522 + 698 + 176 - 289 - 559 + 311 + 213 @@ -10637,12 +10661,12 @@ setEnabled(bool) - 1056 - 522 + 698 + 176 - 289 - 688 + 311 + 342 @@ -10653,12 +10677,12 @@ setEnabled(bool) - 1056 - 522 + 698 + 176 - 845 - 561 + 602 + 215 @@ -10669,12 +10693,12 @@ setEnabled(bool) - 170 - 195 + 85 + 91 - 480 - 197 + 108 + 92 @@ -10685,12 +10709,12 @@ setEnabled(bool) - 330 - 166 + 328 + 152 - 423 - 166 + 421 + 152 @@ -10701,12 +10725,12 @@ setDisabled(bool) - 373 - 608 + 212 + 180 - 406 - 115 + 74 + 81 @@ -10717,12 +10741,12 @@ setEnabled(bool) - 63 - 99 + 61 + 85 - 70 - 99 + 68 + 85 @@ -10733,12 +10757,12 @@ setEnabled(bool) - 63 - 99 + 61 + 85 - 227 - 100 + 225 + 86 @@ -10749,12 +10773,12 @@ setEnabled(bool) - 149 - 151 + 61 + 85 - 246 - 153 + 71 + 85 @@ -10765,12 +10789,12 @@ setEnabled(bool) - 149 - 151 + 61 + 85 - 449 - 153 + 80 + 86 @@ -10781,12 +10805,12 @@ setVisible(bool) - 76 - 436 + 61 + 93 - 134 - 512 + 119 + 96 @@ -10797,12 +10821,12 @@ setVisible(bool) - 105 - 518 + 75 + 98 - 106 - 539 + 117 + 100 @@ -10813,12 +10837,12 @@ setVisible(bool) - 123 - 541 + 66 + 108 - 113 - 712 + 100 + 97 @@ -10829,12 +10853,12 @@ setVisible(bool) - 137 - 269 + 75 + 120 - 155 - 284 + 109 + 100 @@ -10845,12 +10869,12 @@ setVisible(bool) - 77 - 112 + 75 + 98 - 121 - 114 + 119 + 
100 @@ -10861,12 +10885,12 @@ setVisible(bool) - 66 - 543 + 50 + 106 - 134 - 620 + 119 + 101 @@ -10877,12 +10901,12 @@ setVisible(bool) - 133 - 444 + 75 + 103 - 162 - 709 + 119 + 100 @@ -10893,12 +10917,12 @@ setVisible(bool) - 55 - 112 + 53 + 98 - 121 - 114 + 119 + 100 @@ -10909,12 +10933,12 @@ setEnabled(bool) - 857 - 146 + 915 + 142 - 953 - 145 + 971 + 143 @@ -10925,8 +10949,8 @@ setEnabled(bool) - 832 - 146 + 915 + 142 1044 @@ -10941,12 +10965,12 @@ setDisabled(bool) - 1019 - 542 + 813 + 153 - 553 - 114 + 96 + 81 diff --git a/metax/gui/metax_gui/ui_main_window.py b/metax/gui/metax_gui/ui_main_window.py index 27afd67..40abdeb 100644 --- a/metax/gui/metax_gui/ui_main_window.py +++ b/metax/gui/metax_gui/ui_main_window.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Form implementation generated from reading ui file 'c:\Users\Qing\OneDrive - University of Ottawa\code\TaxaFunc\MetaX\metax\gui\metax_gui\main_window.ui' +# Form implementation generated from reading ui file 'c:\Users\max\OneDrive - University of Ottawa\code\TaxaFunc\MetaX\metax\gui\metax_gui\main_window.ui' # # Created by: PyQt5 UI code generator 5.15.9 # @@ -147,7 +147,7 @@ def setupUi(self, metaX_main): self.toolBox_2.setMaximumSize(QtCore.QSize(1677, 16777215)) self.toolBox_2.setObjectName("toolBox_2") self.page_2 = QtWidgets.QWidget() - self.page_2.setGeometry(QtCore.QRect(0, 0, 391, 80)) + self.page_2.setGeometry(QtCore.QRect(0, 0, 528, 530)) self.page_2.setObjectName("page_2") self.gridLayout_27 = QtWidgets.QGridLayout(self.page_2) self.gridLayout_27.setObjectName("gridLayout_27") @@ -317,6 +317,18 @@ def setupUi(self, metaX_main): self.checkBox_infrence_protein_by_sample.setEnabled(False) self.checkBox_infrence_protein_by_sample.setObjectName("checkBox_infrence_protein_by_sample") self.gridLayout_37.addWidget(self.checkBox_infrence_protein_by_sample, 1, 1, 1, 1) + self.horizontalLayout_93 = QtWidgets.QHBoxLayout() + self.horizontalLayout_93.setObjectName("horizontalLayout_93") + self.label_24 = 
QtWidgets.QLabel(self.tab_set_taxa_func) + self.label_24.setObjectName("label_24") + self.horizontalLayout_93.addWidget(self.label_24) + self.spinBox_peptide_num_threshold_protein = QtWidgets.QSpinBox(self.tab_set_taxa_func) + self.spinBox_peptide_num_threshold_protein.setEnabled(False) + self.spinBox_peptide_num_threshold_protein.setMinimum(1) + self.spinBox_peptide_num_threshold_protein.setMaximum(999) + self.spinBox_peptide_num_threshold_protein.setObjectName("spinBox_peptide_num_threshold_protein") + self.horizontalLayout_93.addWidget(self.spinBox_peptide_num_threshold_protein) + self.gridLayout_37.addLayout(self.horizontalLayout_93, 1, 0, 1, 1) self.gridLayout_25.addLayout(self.gridLayout_37, 4, 0, 1, 1) self.label_134 = QtWidgets.QLabel(self.tab_set_taxa_func) sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Preferred, QtWidgets.QSizePolicy.Maximum) @@ -1384,7 +1396,7 @@ def setupUi(self, metaX_main): self.scrollArea_2.setWidgetResizable(True) self.scrollArea_2.setObjectName("scrollArea_2") self.scrollAreaWidgetContents_2 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 621, 150)) + self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 999, 150)) self.scrollAreaWidgetContents_2.setObjectName("scrollAreaWidgetContents_2") self.gridLayout_50 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_2) self.gridLayout_50.setObjectName("gridLayout_50") @@ -1933,7 +1945,7 @@ def setupUi(self, metaX_main): self.scrollArea_cross_heatmap_settings.setWidgetResizable(True) self.scrollArea_cross_heatmap_settings.setObjectName("scrollArea_cross_heatmap_settings") self.scrollAreaWidgetContents_3 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 878, 128)) + self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 1020, 128)) self.scrollAreaWidgetContents_3.setObjectName("scrollAreaWidgetContents_3") self.gridLayout_38 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_3) 
self.gridLayout_38.setObjectName("gridLayout_38") @@ -2867,7 +2879,7 @@ def setupUi(self, metaX_main): self.scrollArea_3.setWidgetResizable(True) self.scrollArea_3.setObjectName("scrollArea_3") self.scrollAreaWidgetContents_4 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 535, 94)) + self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 996, 99)) self.scrollAreaWidgetContents_4.setObjectName("scrollAreaWidgetContents_4") self.gridLayout_68 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_4) self.gridLayout_68.setObjectName("gridLayout_68") @@ -3813,7 +3825,7 @@ def setupUi(self, metaX_main): self.scrollArea_5.setWidgetResizable(True) self.scrollArea_5.setObjectName("scrollArea_5") self.scrollAreaWidgetContents_6 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 620, 65)) + self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 1016, 101)) self.scrollAreaWidgetContents_6.setObjectName("scrollAreaWidgetContents_6") self.gridLayout_57 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_6) self.gridLayout_57.setObjectName("gridLayout_57") @@ -4194,7 +4206,7 @@ def setupUi(self, metaX_main): self.scrollArea_6.setWidgetResizable(True) self.scrollArea_6.setObjectName("scrollArea_6") self.scrollAreaWidgetContents_7 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_7.setGeometry(QtCore.QRect(0, 0, 775, 102)) + self.scrollAreaWidgetContents_7.setGeometry(QtCore.QRect(0, 0, 1016, 185)) self.scrollAreaWidgetContents_7.setObjectName("scrollAreaWidgetContents_7") self.gridLayout_69 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_7) self.gridLayout_69.setObjectName("gridLayout_69") @@ -5333,7 +5345,7 @@ def setupUi(self, metaX_main): self.statusbar.setObjectName("statusbar") metaX_main.setStatusBar(self.statusbar) self.menuBar = QtWidgets.QMenuBar(metaX_main) - self.menuBar.setGeometry(QtCore.QRect(0, 0, 1122, 21)) + self.menuBar.setGeometry(QtCore.QRect(0, 0, 1122, 23)) 
self.menuBar.setObjectName("menuBar") self.menuTools = QtWidgets.QMenu(self.menuBar) self.menuTools.setObjectName("menuTools") @@ -5493,6 +5505,7 @@ def retranslateUi(self, metaX_main): self.comboBox_protein_ranking_method.setItemText(2, _translate("metaX_main", "unique_intensity")) self.comboBox_protein_ranking_method.setItemText(3, _translate("metaX_main", "shared_intensity")) self.checkBox_infrence_protein_by_sample.setText(_translate("metaX_main", "Inference by each Sample")) + self.label_24.setText(_translate("metaX_main", "Peptide Number Threshold of Protein")) self.label_134.setText(_translate("metaX_main", "Sum Proteins Intensity")) self.pushButton_set_multi_table.setText(_translate("metaX_main", "GO")) self.label_39.setText(_translate("metaX_main", "Data Preprocessing")) diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py index d82530a..054b4a7 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -671,6 +671,7 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', + 'peptide_mun_threshold': 1 }, keep_unknow_func: bool = False, split_func: bool = False, split_func_params: dict = {'split_by': '|', 'share_intensity': False}, @@ -684,7 +685,9 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, data_preprocess_params = {'normalize_method': None, 'transform_method': "log10", 'batch_meta': "Individual", 'processing_order': ['outlier', 'transform', 'normalize', 'batch']}, peptide_num_threshold = {'taxa': 3, 'func': 3, 'taxa_func': 3}, - sum_protein = False, sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap'}, + sum_protein = False, sum_protein_params = {'method': 'razor', 'by_sample': False, + 'rank_method': 'unique_counts', 'greedy_method': 'heap', + 'peptide_num_threshold': 3}, keep_unknow_func = 
False) """ print(f"Original data shape: {self.original_df.shape}") @@ -814,7 +817,8 @@ def strip_taxa(x, level): self.processed_original_df = self.data_preprocess(df=df_half_processed_peptides[[self.peptide_col_name, 'Taxon', self.func_name] + self.sample_list], df_name = 'peptide', **data_preprocess_params) # processed_original_df is the peptide table after selected taxa level, func_threshold, outlier detection and handling, then do the rest of data preprocess - self.peptide_df = self.processed_original_df.drop([self.peptide_col_name, 'Taxon', self.func_name], axis=1) + self.peptide_df = self.processed_original_df.drop(['Taxon', self.func_name], axis=1) + self.peptide_df = self.peptide_df.set_index(self.peptide_col_name) ###------Peptide Table End------### @@ -934,6 +938,10 @@ def get_df(self, table_name:str = 'taxa'): # remove peptide_num column if exists if "peptide_num" in dft.columns: dft = dft.drop(columns="peptide_num") + + if table_name in ['protein', 'proteins']: + dft = dft.drop(columns='peptides') + return dft diff --git a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py index eaea80b..e946265 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py +++ b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py @@ -670,7 +670,6 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, df = df.copy() - original_row_num = len(df) # remove items with peptide number less than threshold if df_name in ['taxa', 'func', 'taxa_func']: @@ -697,7 +696,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, if df_name in {'peptide', 'taxa', 'func', 'taxa_func', 'protein', 'custom'}: left_row_num = len(df) # self.tfa.outlier_status[df_name] = f'{left_row_num}/{original_row_num} ({left_row_num/original_row_num*100:.2f}%)' - self.tfa.outlier_status[df_name] = f'{left_row_num} ({left_row_num/original_row_num*100:.2f}% of the 
data before outlier handling)' + self.tfa.outlier_status[df_name] = f'{left_row_num} ({left_row_num/len(self.tfa.original_df)*100:.2f}%)' return df \ No newline at end of file diff --git a/metax/utils/scripts/razor_sum.py b/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py similarity index 71% rename from metax/utils/scripts/razor_sum.py rename to metax/taxafunc_analyzer/analyzer_utils/razor_sum.py index 3a809d8..6c9aa04 100644 --- a/metax/utils/scripts/razor_sum.py +++ b/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py @@ -4,43 +4,91 @@ class RazorSum: - def __init__(self, df, column_map): + def __init__(self, df, column_map, peptide_mun_threshold=1, + greedy_method = 'greedy', share_intensity=False, protein_separator=';'): self.df = df self.column_map = column_map + self.greedy_method = greedy_method + self.peptide_mun_threshold = peptide_mun_threshold # the protein must have at least 3 peptides to be considered as a target + self.share_intensity = share_intensity + self.protein_separator = protein_separator + self.res_intensity_dict = {} # store all sample to output - self.greedy_method = None # only used for razor method self.mini_target_set = None self.filtered_target_to_peptides = None - self.share_intensity = False self.__multi_target_count = 0 - - + + def sum_protein_intensity(self, greedy_method='heap'): self.greedy_method = greedy_method print('Start to sum protein intensity using method: [razor]') - if column_map['sample_list'] is None or len(column_map['sample_list']) == 0: + if self.column_map['sample_list'] is None or len(self.column_map['sample_list']) == 0: raise ValueError('Please provide [sample_list] in column_map for sum, e.g. 
["Sample1", "Sample2", "Sample3"]') # only extract the peptide and target columns extract_cols = [self.column_map['peptide'], self.column_map['target']] + self.column_map['sample_list'] self.df = self.df.loc[:, extract_cols] pep_to_target = self._create_pep_to_target_razor() + + self._sum_target_intensity(pep_to_target) # show summary print(f"Total peptides count: {len(pep_to_target)}") - self.__multi_target_count = self.__multi_target_count/len(sample_list) + # calculate the mean of the multi-target peptides of each sample + self.__multi_target_count = self.__multi_target_count/len(self.column_map['sample_list']) print(f"Multi-target peptides count: {self.__multi_target_count} ({self.__multi_target_count / len(pep_to_target) * 100:.2f}%)") res_df = pd.DataFrame.from_dict(self.res_intensity_dict) res_df.fillna(0, inplace=True) - res_df.index.name = 'Target' + res_df.index.name = self.column_map['target'] + + #add a column of all peptide of the protein + res_df['peptides'] = res_df.index.map(lambda x: ';'.join(self.filtered_target_to_peptides[x])) + # add a column of the peptide number of the protein + res_df['peptide_num'] = res_df.index.map(lambda x: len(self.filtered_target_to_peptides[x])) + + # move teh 2 columns to the front + res_df = res_df[['peptides', 'peptide_num'] + [col for col in res_df.columns if col not in ['peptides', 'peptide_num']]] print('Finish summing protein intensity') return res_df + + def remove_protein_less_than_threshold(self,): + ''' + Remove the proteins with less than threshold peptides in `self.df` + ''' + if self.peptide_mun_threshold <= 1: + print(f"Peptide threshold is [{self.peptide_mun_threshold}], no protein will be removed") + return self.df + + # calculate the number of peptides for each protein + # remove the proteins with less than threshold peptides in df in the protein column not + def remove_proteins(proteins): + proteins_list = proteins.split(self.protein_separator) + proteins_list = [protein for protein in proteins_list 
if protein not in proteins_less_than_threshold] + return self.protein_separator.join(proteins_list) + + target_to_peptides = self._create_target_to_peptides() + + print(f"Remove proteins with less than [{self.peptide_mun_threshold}] peptides, then the peptide with NA protein will be removed") + print(f"Orignal Protein number: [{len(target_to_peptides)}], Peptide number: [{len(self.df)}]") + proteins_less_than_threshold = [target for target, peps in target_to_peptides.items() if len(peps) < self.peptide_mun_threshold] + + + df = self.df.copy() + df[self.column_map['target']] = df[self.column_map['target']].apply(remove_proteins) + + # remove the rows with NA protein of sellf.df + self.df = self.df[df[self.column_map['target']] != ''] + # print The number of proteins and peptides after removing the proteins with less than threshold peptides + print(f"After removing, Protein number: [{len(target_to_peptides) - len(proteins_less_than_threshold)}], Peptide number: [{len(self.df)}]") + + return self.df + def get_mini_target_set(self, greedy_method='heap'): self.greedy_method = greedy_method print('Start to get minimum target set using method: [razor]') @@ -51,8 +99,10 @@ def get_mini_target_set(self, greedy_method='heap'): raise ValueError(f'NA or empty value in target column: {self.column_map["target"]}') self.df = self.df.loc[:, extract_cols] - df = self.df.loc[:, [self.column_map['peptide'], self.column_map['target']]] - peptides = set(df[self.column_map['peptide']]) + + self.remove_protein_less_than_threshold() + + peptides = set(self.df[self.column_map['peptide']]) target_to_peptides = self._create_target_to_peptides() mini_target_set = self.find_minimum_target_set(peptides, target_to_peptides) filtered_target_to_peptides = {target: target_to_peptides[target] for target in mini_target_set} @@ -95,7 +145,7 @@ def _create_target_to_peptides(self): for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating target to peptides mapping"): sequence = 
row[self.column_map['peptide']] - targets = row[self.column_map['target']].split(';') + targets = row[self.column_map['target']].split(self.protein_separator) for target in targets: target_to_peptides[target].add(sequence) @@ -195,9 +245,13 @@ def _update_output_dict(self, target_list, sample_name, intensity): # Example usage: # Assuming df is your pandas dataframe and column_map is your dictionary if __name__ == '__main__': - df = pd.read_csv('OTF.tsv', sep='\t') - df_meta = pd.read_csv('meta.txt', sep='\t') - sample_list = df_meta['Samples'].unique().tolist() + import os + current_path = os.path.dirname(os.path.abspath(__file__)) + df_path = os.path.join(current_path, '../../data/example_data/Example_OTF.tsv') + meta_path = os.path.join(current_path, '../../data/example_data/Example_Meta.tsv') + df = pd.read_csv(df_path, sep='\t') + df_meta = pd.read_csv(meta_path, sep='\t') + sample_list = df_meta['Sample'].unique().tolist() sample_list = ["Intensity_" + sample for sample in sample_list] column_map = { @@ -205,11 +259,11 @@ def _update_output_dict(self, target_list, sample_name, intensity): 'target': 'Proteins', 'sample_list': sample_list # ['Sample1', 'Sample2', 'Sample3'] } - sia = RazorSum(df, column_map) + sia = RazorSum(df, column_map, peptide_mun_threshold=3) res_df = sia.sum_protein_intensity(greedy_method='heap') - res_df.to_csv('razor_protein_intensity.tsv', sep='\t') + # res_df.to_csv('razor_protein_intensity.tsv', sep='\t') # or get minimum target set only - # mini_target_set = sia.get_mini_target_set(greedy_method='heap') + mini_target_set = sia.get_mini_target_set(greedy_method='heap') diff --git a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py index 6bfafdd..11fef21 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py +++ b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py @@ -16,10 +16,11 @@ from collections import defaultdict 
import pandas as pd from tqdm import tqdm +from .razor_sum import RazorSum class SumProteinIntensity: - def __init__(self, taxa_func_analyzer, df=None): + def __init__(self, taxa_func_analyzer, df=None, peptide_mun_threshold=1, protein_separator=';'): self.tfa = taxa_func_analyzer self.res_intensity_dict = {} # store all sample to output self.rank_dict = {} # store the rank of protein intensity for each sample temporarily @@ -30,7 +31,9 @@ def __init__(self, taxa_func_analyzer, df=None): self.greedy_method = None # only used for razor method self.share_intensity = False self.__multi_target_count = 0 - + self.peptide_mun_threshold = peptide_mun_threshold # the protein must have at least 3 peptides to be considered as a target + self.protein_separator = protein_separator + def check_protein_col(self): # if any NA, '', or empty in the protein column, raise error @@ -40,7 +43,7 @@ def check_protein_col(self): if (self.df[self.tfa.protein_col_name].str.strip() == '').any(): raise ValueError(f'There are empty values in {self.tfa.protein_col_name} column') - def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts', greedy_method='heap'): + def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts', greedy_method='heap', peptide_mun_threshold=None): ''' method: str, default 'razor' options: ['razor', 'anti-razor', 'rank'] @@ -57,13 +60,28 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un raise ValueError('Rank method must in ["shared_intensity", "all_counts", "unique_counts", "unique_intensity"]') if greedy_method not in ['greedy', 'heap']: raise ValueError('Greedy method must in ["greedy", "heap"]') + if peptide_mun_threshold is not None: + self.peptide_mun_threshold = peptide_mun_threshold + + # remove the protein with less than the threshold of peptides + # use teh methood in RazorSum + razor_integrator = RazorSum(df=self.df, + column_map={ + 'peptide': 
self.tfa.peptide_col_name, + 'target': self.tfa.protein_col_name, + 'sample_list': self.tfa.sample_list, + }, + peptide_mun_threshold=self.peptide_mun_threshold, + share_intensity=self.share_intensity, + greedy_method=greedy_method, + protein_separator= self.protein_separator) self.rank_method = rank_method - self.greedy_method = greedy_method self.check_protein_col() if method == 'rank': print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [{by_sample}] rank_method: [{rank_method}]-------------") + self.df = razor_integrator.remove_protein_less_than_threshold() # make a dict to count the intensity of each protein, intensity sahred by peptides will be divided by the number of peptides if by_sample: for sample in self.tfa.sample_list: @@ -82,14 +100,13 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un self._sum_protein_rank(sample, by_sample) elif method == 'razor': print('start to sum protein intensity using method: [razor]') - # use Set Cover Problem to get the protein list, then sum the intensity - pep_to_protein = self._create_pep_to_protein_razor() - self._sum_protein_razor(pep_to_protein) - self.__multi_target_count = self.__multi_target_count/len(self.tfa.sample_list) - print(f'Peptides with multiple targets: {self.__multi_target_count} ({self.__multi_target_count/len(pep_to_protein)*100:.2f}%)') + + res_df = razor_integrator.sum_protein_intensity(greedy_method=greedy_method) + return res_df elif method == 'anti-razor': - print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------") + print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------") + self.df = razor_integrator.remove_protein_less_than_threshold() for sample in self.tfa.sample_list: self._sum_protein_anti_razor(sample) @@ -188,9 +205,10 @@ def _create_pep_to_protein_razor(self) -> 
dict: protein_to_peptides = defaultdict(set) peptides = set(df[self.tfa.peptide_col_name]) + separator = self.protein_separator for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating protein to peptides mapping"): sequence = row[self.tfa.peptide_col_name] - proteins = row[self.tfa.protein_col_name].split(';') + proteins = row[self.tfa.protein_col_name].split(separator) for protein in proteins: protein_to_peptides[protein].add(sequence) @@ -235,8 +253,10 @@ def _init_dicts(self): def _update_protein_rank_dict(self, sample_name = None, rank_method = None): def update_by_intesity(df, sample_name=sample_name, method=rank_method): + separator = self.protein_separator + for row in df.itertuples(): - proteins = row[1].split(';') + proteins = row[1].split(separator) shared_times = len(proteins) if method == 'shared_intensity': @@ -305,9 +325,10 @@ def _sum_protein_rank(self, sample_name:str, by_sample=False): # print in one line print(f'Asigning protein intensity for [{sample_name}]', end='\r') df = self.df.loc[:,[ self.tfa.protein_col_name, sample_name]] + separator = self.protein_separator for row in df.itertuples(): - proteins = row[1].split(';') + proteins = row[1].split(separator) intensity = row[2] if len(proteins) == 1: self._update_output_dict(proteins, sample_name, intensity) @@ -331,8 +352,8 @@ def _sum_protein_anti_razor(self, sample_name:str): print(f'Creating protein intensity dict for [{sample_name}]', end='\r') df = self.df.loc[:,[ self.tfa.protein_col_name, sample_name]] self.share_intensity = True - + separator = self.protein_separator for row in df.itertuples(): - proteins = row[1].split(';') + proteins = row[1].split(separator) intensity = row[2] self._update_output_dict(proteins, sample_name, intensity) \ No newline at end of file diff --git a/metax/utils/version.py b/metax/utils/version.py index 2ad705f..43029aa 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.114.1' +__version__ = 
'1.114.2' API_version = '2' \ No newline at end of file