diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md
index 942f5b3..1739fa1 100644
--- a/Docs/ChangeLog.md
+++ b/Docs/ChangeLog.md
@@ -1,3 +1,14 @@
+# Version: 1.114.4
+## Date: 2024-09-23
+### Changes:
+- Fix: Fixed the bug in creating the taxa table and function table from the OTF only; the option did not work correctly.
+- Change: Changed the result message shown after creating the Taxon-Function table.
+
+# Version: 1.114.3
+## Date: 2024-09-22
+### Changes:
+- Fix: Fixed the razor method for summing peptides to proteins; the intensity was not stable across different runs.
+
 # Version: 1.114.2
 ## Date: 2024-09-22
 ### Changes:
diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py
index 1eee15f..a452c65 100644
--- a/metax/gui/main_gui.py
+++ b/metax/gui/main_gui.py
@@ -1852,46 +1852,102 @@ def run_after_set_multi_tables(self):
         # Final message
-        outlier_detect_method = self.comboBox_outlier_detection.currentText()
-
-        if outlier_detect_method != 'None':
-            nan_stats_str = '\n\nLeft row after data preprocessing:\n'
-            for i, j in self.tfa.outlier_status.items():
-                if i not in ['peptide', 'custom']:
-                    continue
-                if j:
-                    nan_stats_str += f'{i}: [{j}]\n'
-            # print(nan_stats_str)
-        else:
-            nan_stats_str = ''
-
         if self.tfa.any_df_mode:
             num_item = self.tfa.custom_df.shape[0]
-            msg = f'Custom data is ready! \
-                \n{nan_stats_str}\
-                \n\nNumber of item: [{num_item}]'
+            msg = f"""
+            <html>
+            <body>
+            <p>Custom data is ready!</p>
+            <p>{nan_stats_str}</p>
+            <p>Number of items: [{num_item}]</p>
+            </body>
+            </html>
+            """
         else:
-            msg = f'Operational Taxa-Functions (OTF) data is ready! \
-                \n{nan_stats_str}\
-                \n\nFunction: [{self.tfa.func_name}]\
-                \nNumber of peptide: [{num_peptide} ({num_peptide/self.tfa.original_df.shape[0]*100:.2f}% of all peptides)]\
-                \nNumber of function: [{num_func}]\
-                \nNumber of taxa: [{num_taxa}]\
-                \nNumber of taxa-function: [{num_taxa_func}]\
-                \nNumber of protein: [{num_protein}]'
-
-        print(f'\n----Multi Table Result----\n{msg}\n---------------------------\n')
-        self.logger.write_log(msg.replace('\n', ''))
-        QMessageBox.information(self.MainWindow, 'Information', msg )
-
-        print("\n---------------------------------- Set Multi Table End ----------------------------------\n")
-        # go to basic analysis tab and the first tab
-        self.stackedWidget.setCurrentIndex(0) # go to page_analyzer
-        self.tabWidget_TaxaFuncAnalyzer.setCurrentIndex(3)
-        self.tabWidget_4.setCurrentIndex(0)
-        self.pushButton_set_multi_table.setEnabled(True)
-
-
+            original_num_peptide = self.tfa.original_df.shape[0]
+
+            msg = f"""
+            <html>
+            <body>
+            <p>Operational Taxa-Functions (OTF) data is ready!</p>
+            <p>Taxa Level: {self.tfa.taxa_level}</p>
+            <p>Function Category: {self.tfa.func_name}</p>
+            <table>
+                <tr>
+                    <th>Category</th>
+                    <th>Number</th>
+                    <th>Used Peptides</th>
+                    <th>% of All Peptides</th>
+                </tr>
+                <tr>
+                    <td>Taxa</td>
+                    <td>{num_taxa}</td>
+                    <td>{self.tfa.peptide_num_used["taxa"]}</td>
+                    <td>{self.tfa.peptide_num_used["taxa"] / original_num_peptide * 100:.2f}%</td>
+                </tr>
+                <tr>
+                    <td>Functions</td>
+                    <td>{num_func}</td>
+                    <td>{self.tfa.peptide_num_used["func"]}</td>
+                    <td>{self.tfa.peptide_num_used["func"] / original_num_peptide * 100:.2f}%</td>
+                </tr>
+                <tr>
+                    <td>OTFs</td>
+                    <td>{num_taxa_func}</td>
+                    <td>{self.tfa.peptide_num_used["taxa_func"]}</td>
+                    <td>{self.tfa.peptide_num_used["taxa_func"] / original_num_peptide * 100:.2f}%</td>
+                </tr>
+                <tr>
+                    <td>Clean Peptides</td>
+                    <td>{num_peptide}</td>
+                    <td>-</td>
+                    <td>{num_peptide / original_num_peptide * 100:.2f}%</td>
+                </tr>
+            """
+
+            # add protein number if protein df is not None
+            if num_protein != 'NA':
+                msg += f"""
+                <tr>
+                    <td>Proteins</td>
+                    <td>{num_protein}</td>
+                    <td>{self.tfa.peptide_num_used["protein"]}</td>
+                    <td>{self.tfa.peptide_num_used["protein"] / original_num_peptide * 100:.2f}%</td>
+                </tr>
+                """
+
+            # close the HTML
+            msg += """
+            </table>
+            </body>
+            </html>
+            """
+
+            msg_for_print = f'''
+            Taxa Level: {self.tfa.taxa_level}
+            Function Category: {self.tfa.func_name}
+            Number of Taxa: {num_taxa} (Peptides Used: {self.tfa.peptide_num_used["taxa"]})
+            Number of Functions: {num_func} (Peptides Used: {self.tfa.peptide_num_used["func"]})
+            Number of OTFs: {num_taxa_func} (Peptides Used: {self.tfa.peptide_num_used["taxa_func"]})
+            Number of Peptides: {num_peptide} ({num_peptide / original_num_peptide * 100:.2f}%)
+            '''
+
+        print(f'\n----Multi Table Result----\n{msg_for_print}\n---------------------------\n')
+        self.logger.write_log(msg_for_print.replace('\n', ''))
+        QMessageBox.information(self.MainWindow, 'Result', msg)
+
 
     ## Database builder by own Table
     def show_toolButton_db_own_anno_help(self):
diff --git a/metax/gui/metax_gui/main_window.ui b/metax/gui/metax_gui/main_window.ui
index c683599..263deb5 100644
--- a/metax/gui/metax_gui/main_window.ui
+++ b/metax/gui/metax_gui/main_window.ui
@@ -246,7 +246,7 @@
      <x>0</x>
      <y>0</y>
      <width>528</width>
-     <height>530</height>
+     <height>534</height>
     </rect>
    </property>
@@ -1385,7 +1385,7 @@
      <string>Create Taxa and Functions only from OTFs</string>
     </property>
     <property name="checked">
-     <bool>true</bool>
+     <bool>false</bool>
    </property>
@@ -5644,7 +5644,7 @@
      <x>0</x>
      <y>0</y>
      <width>996</width>
-     <height>99</height>
+     <height>103</height>
     </rect>
    </property>
@@ -7425,7 +7425,7 @@
      <x>0</x>
      <y>0</y>
      <width>1016</width>
-     <height>101</height>
+     <height>105</height>
     </rect>
    </property>
@@ -10240,7 +10240,7 @@
      <x>0</x>
      <y>0</y>
      <width>1122</width>
-     <height>23</height>
+     <height>21</height>
     </rect>
    </property>
diff --git a/metax/gui/metax_gui/ui_main_window.py b/metax/gui/metax_gui/ui_main_window.py
index 40abdeb..38cdc2a 100644
--- a/metax/gui/metax_gui/ui_main_window.py
+++ b/metax/gui/metax_gui/ui_main_window.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Form implementation generated from reading ui file 'c:\Users\max\OneDrive - University of Ottawa\code\TaxaFunc\MetaX\metax\gui\metax_gui\main_window.ui'
+# Form implementation generated from reading ui file 'c:\Users\Qing\OneDrive - University of Ottawa\code\TaxaFunc\MetaX\metax\gui\metax_gui\main_window.ui'
 #
 # Created by: PyQt5 UI code generator 5.15.9
 #
@@ -147,7 +147,7 @@ def setupUi(self, metaX_main):
         self.toolBox_2.setMaximumSize(QtCore.QSize(1677, 16777215))
         self.toolBox_2.setObjectName("toolBox_2")
         self.page_2 = QtWidgets.QWidget()
-        self.page_2.setGeometry(QtCore.QRect(0, 0, 528, 530))
+        self.page_2.setGeometry(QtCore.QRect(0, 0, 528, 534))
         self.page_2.setObjectName("page_2")
         self.gridLayout_27 = QtWidgets.QGridLayout(self.page_2)
         self.gridLayout_27.setObjectName("gridLayout_27")
@@ -682,7 +682,7 @@ def setupUi(self, metaX_main):
         self.gridLayout_17.addLayout(self.horizontalLayout, 1, 3, 1, 1)
         self.checkBox_set_otf_taxa_and_func_only_from_otf = QtWidgets.QCheckBox(self.tab_set_taxa_func)
         self.checkBox_set_otf_taxa_and_func_only_from_otf.setStatusTip("")
-        self.checkBox_set_otf_taxa_and_func_only_from_otf.setChecked(True)
+        self.checkBox_set_otf_taxa_and_func_only_from_otf.setChecked(False)
         self.checkBox_set_otf_taxa_and_func_only_from_otf.setObjectName("checkBox_set_otf_taxa_and_func_only_from_otf")
         self.gridLayout_17.addWidget(self.checkBox_set_otf_taxa_and_func_only_from_otf, 1, 4, 1, 1)
         self.gridLayout_25.addLayout(self.gridLayout_17, 1, 0, 1, 1)
@@ -2879,7 +2879,7 @@ def setupUi(self, metaX_main):
         self.scrollArea_3.setWidgetResizable(True)
         self.scrollArea_3.setObjectName("scrollArea_3")
         self.scrollAreaWidgetContents_4 = QtWidgets.QWidget()
-        self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 996, 99))
+        self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 996, 103))
         self.scrollAreaWidgetContents_4.setObjectName("scrollAreaWidgetContents_4")
         self.gridLayout_68 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_4)
         self.gridLayout_68.setObjectName("gridLayout_68")
@@ -3825,7 +3825,7 @@ def setupUi(self, metaX_main):
         self.scrollArea_5.setWidgetResizable(True)
         self.scrollArea_5.setObjectName("scrollArea_5")
         self.scrollAreaWidgetContents_6 = QtWidgets.QWidget()
-        self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 1016, 101))
+        self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 1016, 105))
         self.scrollAreaWidgetContents_6.setObjectName("scrollAreaWidgetContents_6")
         self.gridLayout_57 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_6)
         self.gridLayout_57.setObjectName("gridLayout_57")
@@ -5345,7 +5345,7 @@ def setupUi(self, metaX_main):
         self.statusbar.setObjectName("statusbar")
         metaX_main.setStatusBar(self.statusbar)
         self.menuBar = QtWidgets.QMenuBar(metaX_main)
-        self.menuBar.setGeometry(QtCore.QRect(0, 0, 1122, 23))
+        self.menuBar.setGeometry(QtCore.QRect(0, 0, 1122, 21))
         self.menuBar.setObjectName("menuBar")
         self.menuTools = QtWidgets.QMenu(self.menuBar)
         self.menuTools.setObjectName("menuTools")
diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py
index 054b4a7..29248c4 100644
--- a/metax/taxafunc_analyzer/analyzer.py
+++ b/metax/taxafunc_analyzer/analyzer.py
@@ -74,8 +74,7 @@ def __init__(
         self.protein_df: Optional[pd.DataFrame] = None
         self.any_df_mode = any_df_mode # if True, the consider the TaxaFunc df as other_df
         self.custom_df: Optional[pd.DataFrame] = None # other df, any df that user want to add
-        self.outlier_status = {'peptide': None, 'taxa': None, 'func': None,
-                               'taxa_func': None, 'protein': None, 'custom': None}
+        self.peptide_num_used = {'taxa': 0, 'func': 0, 'taxa_func': 0, 'protein': 0}
 
         self.split_func_status:bool = False
         self.split_func_sep:str = ''
@@ -498,7 +497,6 @@ def set_any_df_table(self,
                          data_preprocess_params: dict = {'normalize_method': None, 'transform_method': None,
                                                          'batch_meta': None, 'processing_order': None}):
         df = self.original_df.copy()
-        self.outlier_status['custom'] = None # reset outlier_status
         df =self.detect_and_handle_outliers(df=df, **outlier_params)
         df = self.data_preprocess(df=df,df_name = 'custom', **data_preprocess_params)
         # set index as first column
@@ -699,7 +697,6 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
             return
 
         #! fllowing code is for the normal mode
-
         # add 'peptide_num_threshold' to 'data_preprocess_params
         data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold
 
@@ -707,14 +704,15 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
         if sum_protein:
             # data preprocess for peptide table
             print("---Starting to create protein table---")
+            self.peptide_num_used['protein'] = 0
             df_peptide_for_protein = self.detect_and_handle_outliers(df=self.original_df, **outlier_params)
             self.protein_df = SumProteinIntensity(taxa_func_analyzer=self, df=df_peptide_for_protein).sum_protein_intensity( **sum_protein_params)
             self.protein_df = self.data_preprocess(df=self.protein_df,df_name = 'protein', **data_preprocess_params)
-
-        # reset outlier_status
-        self.outlier_status = {'peptide': None, 'taxa': None, 'func': None, 'taxa_func': None}
+        for df_name in ['taxa', 'func', 'taxa_func']:
+            self.peptide_num_used[df_name] = 0 # reset the peptide_num_used
+
         # reset split_func status
         self.split_func_status = split_func
         self.split_func_sep = split_func_params['split_by']
@@ -767,27 +765,28 @@ def strip_taxa(x, level):
             raise ValueError("Please input the correct taxa level (m, s, g, f, o, c, p, d, l)")
 
-        # extract 'taxa', sample intensity #! and 'peptide_num' fto avoid the duplicated items when handling outlier
-        df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list]
-        # add column 'peptide_num' to df_taxa as 1
-        df_taxa_pep['peptide_num'] = 1
-
-        # if taxa_and_func_only_from_otf:
-        if True: # for testing
+        if not taxa_and_func_only_from_otf:
+            # extract 'taxa', sample intensity #! and 'peptide_col' to avoid the duplicated items when handling outlier
+            df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list]
+            # add column 'peptide_num' to df_taxa as 1
+            df_taxa_pep['peptide_num'] = 1
             # groupby 'Taxon' and sum the sample intensity
             print("\n-----Starting to perform outlier detection and handling for [Peptide-Taxon] table...-----")
             df_taxa_pep = self.detect_and_handle_outliers(df=df_taxa_pep, **outlier_params)
+            self.peptide_num_used['taxa'] = len(df_taxa_pep)
             df_taxa = df_taxa_pep.groupby('Taxon').sum(numeric_only=True)
             print("\n-----Starting to perform data pre-processing for Taxa table...-----")
             df_taxa = self.data_preprocess(df=df_taxa,df_name = 'taxa', **data_preprocess_params)
             self.taxa_df = df_taxa
             #-----Taxa Table End-----
+
         # create func table
         df_func_pep = self.filter_peptides_by_taxa_func(df= self.original_df, func_threshold=func_threshold, keep_unknow_func=keep_unknow_func, filter_taxa=False)
         df_func_pep = df_func_pep[[self.peptide_col_name, self.func_name] + self.sample_list]
         print("\n-----Starting to perform outlier detection and handling for [Peptide-Function] table...-----")
         df_func_pep = self.detect_and_handle_outliers(df=df_func_pep, **outlier_params)
+        self.peptide_num_used['func'] = len(df_func_pep)
         df_func_pep['peptide_num'] = 1
 
         df_func = df_func_pep.groupby(self.func_name).sum(numeric_only=True)
@@ -826,6 +825,11 @@ def strip_taxa(x, level):
         # ----- create taxa_func table -----
         df_taxa_func = df_half_processed_peptides[[self.peptide_col_name, 'Taxon', self.func_name] + self.sample_list]
         df_taxa_func['peptide_num'] = 1
+
+        for key in ['taxa_func', 'taxa', 'func']:
+            self.peptide_num_used[key] = len(df_taxa_func) if self.peptide_num_used[key] == 0 else self.peptide_num_used[key]
+
+
         df_taxa_func = df_taxa_func.groupby(['Taxon', self.func_name], as_index=True).sum(numeric_only=True)
 
         # split the function before data preprocess
diff --git a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
index e946265..9df4db8 100644
--- a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
+++ b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py
@@ -693,10 +693,6 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None,
             raise ValueError('processing_order must be in [outlier, batch, transform, normalize]')
 
         print(f'\n{self._get_current_time()} -----Data preprocessing of {df_name.upper()} finished.-----\n')
-        if df_name in {'peptide', 'taxa', 'func', 'taxa_func', 'protein', 'custom'}:
-            left_row_num = len(df)
-            # self.tfa.outlier_status[df_name] = f'{left_row_num}/{original_row_num} ({left_row_num/original_row_num*100:.2f}%)'
-            self.tfa.outlier_status[df_name] = f'{left_row_num} ({left_row_num/len(self.tfa.original_df)*100:.2f}%)'
-
+
         return df
\ No newline at end of file
diff --git a/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py b/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py
index 6c9aa04..7de67f0 100644
--- a/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py
+++ b/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py
@@ -20,6 +20,12 @@ def __init__(self, df, column_map, peptide_mun_threshold=1,
 
     def sum_protein_intensity(self, greedy_method='heap'):
+        # reset the results to avoid the influence of previous results
+        self.res_intensity_dict = {} #
+        self.__multi_target_count = 0
+        self.mini_target_set = None
+        self.filtered_target_to_peptides = None
+
         self.greedy_method = greedy_method
         print('Start to sum protein intensity using method: [razor]')
         if self.column_map['sample_list'] is None or len(self.column_map['sample_list']) == 0:
@@ -35,7 +41,7 @@ def sum_protein_intensity(self, greedy_method='heap'):
 
         # show summary
         print(f"Total peptides count: {len(pep_to_target)}")
-        # calculate the mean of the multi-target peptides of each sample
+        # calculate the mean count of multi-target peptides per sample
         self.__multi_target_count = self.__multi_target_count/len(self.column_map['sample_list'])
         print(f"Multi-target peptides count: {self.__multi_target_count} ({self.__multi_target_count / len(pep_to_target) * 100:.2f}%)")
 
@@ -80,8 +86,10 @@ def remove_proteins(proteins):
 
         df = self.df.copy()
-        df[self.column_map['target']] = df[self.column_map['target']].apply(remove_proteins)
+        tqdm.pandas(desc="Removing proteins")
+        df[self.column_map['target']] = df[self.column_map['target']].progress_apply(remove_proteins)
+        # remove the rows with NA protein of self.df
         self.df = self.df[df[self.column_map['target']] != '']
 
         # print The number of proteins and peptides after removing the proteins with less than threshold peptides
@@ -102,7 +110,8 @@ def get_mini_target_set(self, greedy_method='heap'):
 
         self.remove_protein_less_than_threshold()
 
-        peptides = set(self.df[self.column_map['peptide']])
+        # peptides = set(self.df[self.column_map['peptide']])
+        peptides = list(dict.fromkeys(self.df[self.column_map['peptide']]))
         target_to_peptides = self._create_target_to_peptides()
         mini_target_set = self.find_minimum_target_set(peptides, target_to_peptides)
         filtered_target_to_peptides = {target: target_to_peptides[target] for target in mini_target_set}
@@ -121,15 +130,19 @@ def _create_pep_to_target_razor(self):
         """
         self.get_mini_target_set(self.greedy_method)
 
-        peptides = set(self.df[self.column_map['peptide']])
+        # keep the order of the peptides
+        peptides = list(dict.fromkeys(self.df[self.column_map['peptide']]))
         filtered_target_to_peptides = self.filtered_target_to_peptides
         peptide_to_target = defaultdict(list)
 
         for peptide in tqdm(peptides, desc="Assigning peptides to targets"):
-            possible_targets = [target for target, peps in filtered_target_to_peptides.items() if peptide in peps]
+            # possible_targets = [target for target, peps in filtered_target_to_peptides.items() if peptide in peps]
+            possible_targets = sorted([target for target, peps in filtered_target_to_peptides.items() if peptide in peps])
+
             if possible_targets:
                 max_target_count = max(len(filtered_target_to_peptides[target]) for target in possible_targets)
-                best_targets = [target for target in possible_targets if len(filtered_target_to_peptides[target]) == max_target_count]
+                # best_targets = [target for target in possible_targets if len(filtered_target_to_peptides[target]) == max_target_count]
+                best_targets = sorted([target for target in possible_targets if len(filtered_target_to_peptides[target]) == max_target_count])
                 peptide_to_target[peptide].extend(best_targets)
 
         return peptide_to_target
@@ -230,6 +243,7 @@ def _update_output_dict(self, target_list, sample_name, intensity):
                 self.res_intensity_dict.setdefault(sample_name, {}).setdefault(target, 0)
                 self.res_intensity_dict[sample_name][target] += intensity
         else:
+            target_list = sorted(target_list)
             if self.share_intensity:
                 intensity /= len(target_list)
                 for target in target_list:
@@ -261,9 +275,10 @@ def _update_output_dict(self, target_list, sample_name, intensity):
     }
 
     sia = RazorSum(df, column_map, peptide_mun_threshold=3)
-    res_df = sia.sum_protein_intensity(greedy_method='heap')
+    res_df = sia.sum_protein_intensity(greedy_method='greedy')
+    # res_df.to_csv('razor_protein_intensity.tsv', sep='\t')
 
     # or get minimum target set only
-    mini_target_set = sia.get_mini_target_set(greedy_method='heap')
+    # mini_target_set = sia.get_mini_target_set(greedy_method='heap')
diff --git a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py
index 11fef21..9aaa11a 100644
--- a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py
+++ b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py
@@ -79,9 +79,11 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un
         self.rank_method = rank_method
         self.check_protein_col()
 
+        self.df = razor_integrator.remove_protein_less_than_threshold()
+        self.tfa.peptide_num_used['protein'] = len(self.df)
+
         if method == 'rank':
             print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [{by_sample}] rank_method: [{rank_method}]-------------")
-            self.df = razor_integrator.remove_protein_less_than_threshold()
             # make a dict to count the intensity of each protein, intensity sahred by peptides will be divided by the number of peptides
             if by_sample:
                 for sample in self.tfa.sample_list:
@@ -100,13 +102,12 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un
                     self._sum_protein_rank(sample, by_sample)
         elif method == 'razor':
             print('start to sum protein intensity using method: [razor]')
-
+            razor_integrator.peptide_mun_threshold = 1 # set the threshold to 1, to avoid running the filter again
             res_df = razor_integrator.sum_protein_intensity(greedy_method=greedy_method)
             return res_df
         elif method == 'anti-razor':
             print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------")
-            self.df = razor_integrator.remove_protein_less_than_threshold()
             for sample in self.tfa.sample_list:
                 self._sum_protein_anti_razor(sample)
diff --git a/metax/utils/version.py b/metax/utils/version.py
index 43029aa..6812e9a 100644
--- a/metax/utils/version.py
+++ b/metax/utils/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.114.2'
+__version__ = '1.114.4'
 API_version = '2'
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 4353e18..0cf48bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "MetaXTools"
-version = "1.114.1"
+version = "1.114.4"
 description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics."
 readme = "README_PyPi.md"
 license = { text = "NorthOmics" }