From e7aad1f175853584caaffa355efcc42f3a2712a0 Mon Sep 17 00:00:00 2001 From: Qing <44231502+byemaxx@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:43:13 -0400 Subject: [PATCH 1/4] - New: added a new Normalization method: "Trace Shifting" for the data preprossing. --- Docs/ChangeLog.md | 6 +- Docs/MetaX_Cookbook.md | 4 ++ metax/gui/main_gui.py | 1 + metax/gui/metax_gui/main_window.ui | 25 +++++---- metax/gui/metax_gui/ui_main_window.py | 24 ++++---- metax/taxafunc_analyzer/analyzer.py | 31 +++++++++-- .../analyzer_utils/data_preprocessing.py | 8 ++- metax/taxafunc_analyzer/analyzer_utils/lfq.py | 55 +++++++++++++++++-- metax/utils/version.py | 2 +- pyproject.toml | 2 +- 10 files changed, 122 insertions(+), 36 deletions(-) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index fd2c231..d24f626 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,9 +1,13 @@ +# Version: 1.115.2 +## Date: 2024-10-03 +### Changes: +- New: added a new Normalization method: "Trace Shifting" for the data preprossing. + # Version: 1.115.1 ## Date: 2024-10-03 ### Changes: - New: added multiprocessing option for LFQ. - # Version: 1.115.0 ## Date: 2024-10-02 ### Changes: diff --git a/Docs/MetaX_Cookbook.md b/Docs/MetaX_Cookbook.md index d496bc0..1808fe0 100644 --- a/Docs/MetaX_Cookbook.md +++ b/Docs/MetaX_Cookbook.md @@ -339,7 +339,11 @@ There are several methods for detecting and handling outliers. - **Data Normalization:** + - **Trace Shifting:** Reframing the Normalization Problem with Intensity traces (inspired by DirectLFQ). + - Note: If both trace shifting and transformation are applied, *normalization will be done before transformation.* + - Standard Scaling (Z-Score), Min-Max Scaling, Pareto Scaling, Mean centring and Normalization by sum. + If you use [Z-Score, Mean centring and Pareto Scaling] data normalization, the data will be given a minimum offset again to avoid negative values. 
diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index 3e133cc..a7332de 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -2619,6 +2619,7 @@ def set_multi_table(self, restore_taxafunc=False, saved_obj=None): } normalize_dict = { "None": None, + "Trace Shifting": "trace_shift", "Mean centering": "mean", "Standard Scaling (Z-Score)": "zscore", "Min-Max Scaling": "minmax", diff --git a/metax/gui/metax_gui/main_window.ui b/metax/gui/metax_gui/main_window.ui index 22e9bc0..530ed76 100644 --- a/metax/gui/metax_gui/main_window.ui +++ b/metax/gui/metax_gui/main_window.ui @@ -245,8 +245,8 @@ 0 0 - 528 - 534 + 391 + 80 @@ -939,6 +939,11 @@ None + + + Trace Shifting + + Standard Scaling (Z-Score) @@ -2771,7 +2776,7 @@ 0 0 - 999 + 621 150 @@ -3774,7 +3779,7 @@ 0 0 - 1020 + 878 128 @@ -5668,8 +5673,8 @@ 0 0 - 996 - 103 + 535 + 94 @@ -7449,8 +7454,8 @@ 0 0 - 1016 - 105 + 620 + 65 @@ -8158,8 +8163,8 @@ 0 0 - 1016 - 185 + 775 + 102 diff --git a/metax/gui/metax_gui/ui_main_window.py b/metax/gui/metax_gui/ui_main_window.py index f077358..efb045f 100644 --- a/metax/gui/metax_gui/ui_main_window.py +++ b/metax/gui/metax_gui/ui_main_window.py @@ -147,7 +147,7 @@ def setupUi(self, metaX_main): self.toolBox_2.setMaximumSize(QtCore.QSize(1677, 16777215)) self.toolBox_2.setObjectName("toolBox_2") self.page_2 = QtWidgets.QWidget() - self.page_2.setGeometry(QtCore.QRect(0, 0, 528, 534)) + self.page_2.setGeometry(QtCore.QRect(0, 0, 391, 80)) self.page_2.setObjectName("page_2") self.gridLayout_27 = QtWidgets.QGridLayout(self.page_2) self.gridLayout_27.setObjectName("gridLayout_27") @@ -484,6 +484,7 @@ def setupUi(self, metaX_main): self.comboBox_set_data_normalization.addItem("") self.comboBox_set_data_normalization.addItem("") self.comboBox_set_data_normalization.addItem("") + self.comboBox_set_data_normalization.addItem("") self.gridLayout_15.addWidget(self.comboBox_set_data_normalization, 7, 1, 1, 3) self.comboBox_remove_batch_effect = 
QtWidgets.QComboBox(self.tab_set_taxa_func) sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Preferred, QtWidgets.QSizePolicy.Fixed) @@ -1407,7 +1408,7 @@ def setupUi(self, metaX_main): self.scrollArea_2.setWidgetResizable(True) self.scrollArea_2.setObjectName("scrollArea_2") self.scrollAreaWidgetContents_2 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 999, 150)) + self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 621, 150)) self.scrollAreaWidgetContents_2.setObjectName("scrollAreaWidgetContents_2") self.gridLayout_50 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_2) self.gridLayout_50.setObjectName("gridLayout_50") @@ -1956,7 +1957,7 @@ def setupUi(self, metaX_main): self.scrollArea_cross_heatmap_settings.setWidgetResizable(True) self.scrollArea_cross_heatmap_settings.setObjectName("scrollArea_cross_heatmap_settings") self.scrollAreaWidgetContents_3 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 1020, 128)) + self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 878, 128)) self.scrollAreaWidgetContents_3.setObjectName("scrollAreaWidgetContents_3") self.gridLayout_38 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_3) self.gridLayout_38.setObjectName("gridLayout_38") @@ -2890,7 +2891,7 @@ def setupUi(self, metaX_main): self.scrollArea_3.setWidgetResizable(True) self.scrollArea_3.setObjectName("scrollArea_3") self.scrollAreaWidgetContents_4 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 996, 103)) + self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 535, 94)) self.scrollAreaWidgetContents_4.setObjectName("scrollAreaWidgetContents_4") self.gridLayout_68 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_4) self.gridLayout_68.setObjectName("gridLayout_68") @@ -3836,7 +3837,7 @@ def setupUi(self, metaX_main): self.scrollArea_5.setWidgetResizable(True) 
self.scrollArea_5.setObjectName("scrollArea_5") self.scrollAreaWidgetContents_6 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 1016, 105)) + self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 620, 65)) self.scrollAreaWidgetContents_6.setObjectName("scrollAreaWidgetContents_6") self.gridLayout_57 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_6) self.gridLayout_57.setObjectName("gridLayout_57") @@ -4217,7 +4218,7 @@ def setupUi(self, metaX_main): self.scrollArea_6.setWidgetResizable(True) self.scrollArea_6.setObjectName("scrollArea_6") self.scrollAreaWidgetContents_7 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_7.setGeometry(QtCore.QRect(0, 0, 1016, 185)) + self.scrollAreaWidgetContents_7.setGeometry(QtCore.QRect(0, 0, 775, 102)) self.scrollAreaWidgetContents_7.setObjectName("scrollAreaWidgetContents_7") self.gridLayout_69 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_7) self.gridLayout_69.setObjectName("gridLayout_69") @@ -5555,11 +5556,12 @@ def retranslateUi(self, metaX_main): self.comboBox_outlier_handling_method2.setItemText(4, _translate("metaX_main", "regression")) self.label_102.setText(_translate("metaX_main", "Outliers Handling by")) self.comboBox_set_data_normalization.setItemText(0, _translate("metaX_main", "None")) - self.comboBox_set_data_normalization.setItemText(1, _translate("metaX_main", "Standard Scaling (Z-Score)")) - self.comboBox_set_data_normalization.setItemText(2, _translate("metaX_main", "Min-Max Scaling")) - self.comboBox_set_data_normalization.setItemText(3, _translate("metaX_main", "Pareto Scaling")) - self.comboBox_set_data_normalization.setItemText(4, _translate("metaX_main", "Mean centering")) - self.comboBox_set_data_normalization.setItemText(5, _translate("metaX_main", "Normalization by sum")) + self.comboBox_set_data_normalization.setItemText(1, _translate("metaX_main", "Trace Shifting")) + self.comboBox_set_data_normalization.setItemText(2, 
_translate("metaX_main", "Standard Scaling (Z-Score)")) + self.comboBox_set_data_normalization.setItemText(3, _translate("metaX_main", "Min-Max Scaling")) + self.comboBox_set_data_normalization.setItemText(4, _translate("metaX_main", "Pareto Scaling")) + self.comboBox_set_data_normalization.setItemText(5, _translate("metaX_main", "Mean centering")) + self.comboBox_set_data_normalization.setItemText(6, _translate("metaX_main", "Normalization by sum")) self.comboBox_remove_batch_effect.setItemText(0, _translate("metaX_main", "None")) self.label_41.setText(_translate("metaX_main", "Data Normalization")) self.label_43.setText(_translate("metaX_main", "Batch Effect Correction")) diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py index 8dbb7ac..24e37ff 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -688,12 +688,31 @@ def run_lfq_for_taxa_func(self, df_taxa_func): df_taxa_func = df_taxa_func.set_index(['Taxon', self.func_name], drop=True) return df_taxa_func + + def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_threshold): + data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold + + normalize_method = data_preprocess_params['normalize_method'] + transform_method = data_preprocess_params['transform_method'] + processing_order = data_preprocess_params['processing_order'] + + if 'trace_shift' == normalize_method and transform_method not in ['None', None]: + print(f'Warning: [Trace Shifting] and {transform_method} are both set, Normalize will be prior to Transform.') + # move 'normalize' to the first + processing_order = ['normalize'] + [i for i in processing_order if i != 'normalize'] + print(f'Data Preprocessing order: {processing_order}') + + data_preprocess_params['processing_order'] = processing_order + + + return data_preprocess_params + def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, outlier_params: dict = 
{'detect_method': None, 'handle_method': None, "detection_by_group" : None, "handle_by_group": None}, data_preprocess_params: dict = {'normalize_method': None, 'transform_method': None, - 'batch_meta': None, 'processing_order': None}, + 'batch_meta': None, 'processing_order': ['transform', 'normalize', 'batch']}, peptide_num_threshold: dict = {'taxa': 1, 'func': 1, 'taxa_func': 1}, sum_protein:bool = False, sum_protein_params: dict = {'method': 'razor', 'by_sample': False, @@ -731,8 +750,8 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, return #! fllowing code is for the normal mode - # add 'peptide_num_threshold' to 'data_preprocess_params - data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold + # Update 'data_preprocess_params' + data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params, peptide_num_threshold) #2. sum the protein intensity if sum_protein: @@ -1009,15 +1028,15 @@ def get_df(self, table_name:str = 'taxa'): outlier_params = {'detect_method': 'zero-dominant', 'handle_method': 'original', "detection_by_group" : 'Individual', "handle_by_group": None}, data_preprocess_params = { - 'normalize_method': None, + 'normalize_method': 'trace_shift', 'transform_method': "log2", 'batch_meta': 'None', - 'processing_order': None}, + 'processing_order': ['transform', 'normalize', 'batch']}, peptide_num_threshold = {'taxa': 2, 'func': 2, 'taxa_func': 2}, keep_unknow_func=False, sum_protein=False, sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3}, split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False}, - taxa_and_func_only_from_otf=False, quant_method='lfq' + taxa_and_func_only_from_otf=False, quant_method='sum' ) sw.check_attributes() \ No newline at end of file diff --git a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py 
b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py index 9df4db8..fef5f33 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py +++ b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py @@ -101,6 +101,10 @@ def _data_transform(self, df: pd.DataFrame, transform_method: str|None = None) - def _data_normalization(self, df: pd.DataFrame, normalize_method: str|None = None) -> pd.DataFrame: + def trace_shift(x): + from .lfq import run_normalization + return run_normalization(x) + if normalize_method is None: print('normalize_method is not set, data normalization did not perform.') else: @@ -117,7 +121,8 @@ def _data_normalization(self, df: pd.DataFrame, normalize_method: str|None = Non 'sum': lambda x: x / (x.sum() + epsilon), 'minmax': lambda x: (x - x.min()) / (x.max() - x.min()), 'zscore': lambda x: (x - x.mean()) / (x.std() + epsilon), - 'pareto': lambda x: (x - x.mean()) / (np.sqrt(x.std() + epsilon)) + 'pareto': lambda x: (x - x.mean()) / (np.sqrt(x.std() + epsilon)), + 'trace_shift': lambda x: trace_shift(x) } if normalize_method in normalize_operations: @@ -624,6 +629,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, - `normalize_method` (`str`, optional): Method used for data normalization. Options include: - `None`: No normalization. + - `trace_shift`: Trace shift normalization inspired by DirectLFQ. - `mean`: Mean normalization. - `sum`: Sum normalization. - `minmax`: Min-max normalization. 
diff --git a/metax/taxafunc_analyzer/analyzer_utils/lfq.py b/metax/taxafunc_analyzer/analyzer_utils/lfq.py index 05fa0a8..abe5dcc 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/lfq.py +++ b/metax/taxafunc_analyzer/analyzer_utils/lfq.py @@ -2,7 +2,6 @@ import pandas as pd import numpy as np from numba import njit -import multiprocessing import os # Setup logging @@ -11,7 +10,7 @@ def setup_logging(): logging.basicConfig( level=logging.INFO, - format="LFQ: %(message)s", + format="%(asctime)s - %(levelname)s - %(message)s", ) setup_logging() @@ -451,7 +450,7 @@ def get_list_of_tuple_w_protein_profiles_and_shifted_peptides( ) if num_cores is not None and num_cores > 1: - # Use multiprocessing + import multiprocessing pool = multiprocessing.Pool(num_cores) args = [ ( @@ -644,12 +643,53 @@ def get_ion_intensity_dataframe_from_list_of_shifted_peptides( return ion_df +def is_numeric_matrix(df): + # mark non-numeric values as NaN + numeric_df = df.apply(pd.to_numeric, errors='coerce') + # check if nan values are present + return numeric_df.notna().all().all() + + +def run_normalization( + input_df: pd.DataFrame, + number_of_quadratic_samples: int = 100 + ): + ''' + Normalize the input DataFrame. + Args: + input_df (pd.DataFrame): A matrix of intensities.Columns are samples, index is items to be normalized. + number_of_quadratic_samples (int, optional): How many samples are used to create the anchor intensity trace. Increasing might marginally increase performance at the cost of runtime + Returns: + pd.DataFrame: The normalized DataFrame. + ''' + # chcek if only numbers are in the dataframe + if not is_numeric_matrix(input_df): + raise ValueError("Input DataFrame contains non-numeric values. 
Make sure to the items column is set as index.") + + copy_numpy_arrays = check_whether_to_copy_numpy_arrays_derived_from_pandas() + input_df = np.log2(input_df.replace(0, np.nan)) # type: ignore + input_df = input_df.dropna(axis=0, how="all") + + LOGGER.info("Performing sample normalization.") + input_df = NormalizationManagerSamplesOnSelectedProteins( + input_df, + num_samples_quadratic=number_of_quadratic_samples, + selected_proteins_file=None, + copy_numpy_arrays=copy_numpy_arrays, + ).complete_dataframe + # restore log2 values + input_df = 2 ** input_df + # fill NaNs with 0 + input_df = input_df.fillna(0) + + return input_df + def run_lfq( input_df, protein_id: str = "protein", quant_id: str = "ion", min_nonan: int = 1, - number_of_quadratic_samples: int = 50, + number_of_quadratic_samples: int = 100, maximum_number_of_quadratic_ions_to_use_per_protein: int = 10, log_processed_proteins: bool = True, compile_normalized_ion_table: bool = True, @@ -699,17 +739,22 @@ def run_lfq( df_path = os.path.join(current_dir, "../../../local_tests/peptide_for_protein.tsv") df = pd.read_csv(df_path, sep="\t") + # protein_df = df.drop(columns=["Proteins"]) + # protein_df.set_index("Sequence", inplace=True) + # print(protein_df.head()) + # df1 = run_normalization(protein_df) protein_df, ion_df = run_lfq( df, protein_id="Proteins", quant_id="Sequence", min_nonan=1, - number_of_quadratic_samples=50, + number_of_quadratic_samples=500, maximum_number_of_quadratic_ions_to_use_per_protein=10, num_cores=None, use_multiprocessing=True ) + print(protein_df.shape) print(protein_df.head()) t2 = time.time() diff --git a/metax/utils/version.py b/metax/utils/version.py index cfa61a3..15f221c 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.115.1' +__version__ = '1.115.2' API_version = '3' \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8d4e3df..a7b6371 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 
@@ build-backend = "setuptools.build_meta" [project] name = "MetaXTools" -version = "1.115.1" +version = "1.115.2" description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics." readme = "README_PyPi.md" license = { text = "NorthOmics" } From 349e3a6da09b28ff0c6f1e58151b0f0880ddf93e Mon Sep 17 00:00:00 2001 From: Qing <44231502+byemaxx@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:15:31 -0400 Subject: [PATCH 2/4] - Fix: Fixed the filter of pvalue or padj option not work for functional redundancy analysis in T-Test part. --- Docs/ChangeLog.md | 7 ++++++- metax/gui/main_gui.py | 5 +++-- metax/taxafunc_analyzer/analyzer_utils/cross_test.py | 2 +- metax/utils/version.py | 2 +- pyproject.toml | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index d24f626..05e1ae3 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,5 +1,10 @@ +# Version: 1.115.3 +## Date: 2024-10-04 +### Changes: +- Fix: Fixed the filter of pvalue or padj option not work for functional redundancy analysis in T-Test part. + # Version: 1.115.2 -## Date: 2024-10-03 +## Date: 2024-10-04 ### Changes: - New: added a new Normalization method: "Trace Shifting" for the data preprossing. 
diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index a7332de..10cbe18 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -4881,12 +4881,13 @@ def t_test(self): try: self.pushButton_ttest.setEnabled(False) group_list = [group1, group2] - table_names = [] + table_names = [] # reset table_names as empty list if df_type == 'Significant Taxa-Func'.lower(): p_value = self.doubleSpinBox_top_heatmap_pvalue.value() p_value = round(p_value, 4) + p_type = self.comboBox_top_heatmap_p_type.currentText() - ttest_sig_tf_params = {'group_list': group_list, 'p_value': p_value, 'condition': condition} + ttest_sig_tf_params = {'group_list': group_list, 'p_value': p_value, 'condition': condition, "p_type": p_type} self.run_in_new_window(self.tfa.CrossTest.get_stats_diff_taxa_but_func, callback= self.callback_after_ttest, **ttest_sig_tf_params) diff --git a/metax/taxafunc_analyzer/analyzer_utils/cross_test.py b/metax/taxafunc_analyzer/analyzer_utils/cross_test.py index 1139341..6ae5e51 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/cross_test.py +++ b/metax/taxafunc_analyzer/analyzer_utils/cross_test.py @@ -598,7 +598,7 @@ def get_stats_diff_taxa_but_func(self, group_list: list|None = None, p_value: fl condition:list|None =None, p_type: str = 'padj' ) -> tuple: p_col_name = 'pvalue' if p_type == 'pvalue' else 'padj' - + print(f"Using [{p_col_name}] for filtering") # calculate the test result if not given if taxa_res_df is None or func_res_df is None or taxa_func_res_df is None: print("No test result given, calculating the test result first") diff --git a/metax/utils/version.py b/metax/utils/version.py index 15f221c..06dc858 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.115.2' +__version__ = '1.115.3' API_version = '3' \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a7b6371..b99ae8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = 
"setuptools.build_meta" [project] name = "MetaXTools" -version = "1.115.2" +version = "1.115.3" description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics." readme = "README_PyPi.md" license = { text = "NorthOmics" } From 9060e6d8204c3c712bd46a57bc5ff9d259548012 Mon Sep 17 00:00:00 2001 From: Qing <44231502+byemaxx@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:18:46 -0400 Subject: [PATCH 3/4] move peptide number threshold out of the data_preprocessing to get the accurate number of peptides used in the analysis --- Docs/ChangeLog.md | 5 + metax/gui/main_gui.py | 3 +- metax/taxafunc_analyzer/analyzer.py | 92 +++++++++++++++++-- .../analyzer_utils/data_preprocessing.py | 15 +-- metax/utils/version.py | 2 +- 5 files changed, 92 insertions(+), 25 deletions(-) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index 05e1ae3..3c07c5b 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,3 +1,8 @@ +# Version: 1.115.4 +## Date: 2024-10-07 +### Changes: +- TODO: use the peptide number for 'self.peptide_num_used' after filtering the minimum peptide number + # Version: 1.115.3 ## Date: 2024-10-04 ### Changes: diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index 10cbe18..73d4479 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -1857,7 +1857,6 @@ def run_after_set_multi_tables(self): msg = f"""

Custom data is ready!

-

{nan_stats_str}

Number of items: [{num_item}]

@@ -1890,7 +1889,7 @@ def run_after_set_multi_tables(self): - + diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py index 24e37ff..b21e774 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -689,8 +689,7 @@ def run_lfq_for_taxa_func(self, df_taxa_func): return df_taxa_func - def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_threshold): - data_preprocess_params['peptide_num_threshold'] = peptide_num_threshold + def update_data_preprocess_parameters(self, data_preprocess_params): normalize_method = data_preprocess_params['normalize_method'] transform_method = data_preprocess_params['transform_method'] @@ -706,7 +705,67 @@ def update_data_preprocess_parameters(self, data_preprocess_params, peptide_num_ return data_preprocess_params + + def filter_peptides_num_for_splited_func(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False): + ''' + Only for the splited func table or taxa_func table + - df: the splited func table or taxa_func table which has been grouped, index is the func or taxa_func + - peptide_num_threshold: the threshold of peptide number for each func or taxa_func + - df_type: 'func' or 'taxa_func' + - distinct_threshold_mode: TODO + ''' + + valid_df_types = ['func', 'taxa_func'] + if df_type not in valid_df_types: + raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]") + + peptide_num= peptide_num_threshold[df_type] + df_original_len = len(df) + + df = df[df['peptide_num'] >= peptide_num] + print(f"Removed [{df_original_len - len(df)} {df_type}] with less than [{peptide_num}] peptides.") + return df + + + + def filter_peptides_num(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False): + ''' + Filter the peptides based on the peptide number threshold + - df: the original df including peptides, taxa, and functions, etc. 
+ - peptide_num_threshold: the threshold of peptide number for each taxa or func + - df_type: 'taxa', 'func', or 'taxa_func' + - distinct_threshold_mode: TODO + ''' + valid_df_types = ['taxa', 'func', 'taxa_func'] + if df_type not in valid_df_types: + raise ValueError(f"df_type must be one of {valid_df_types}, your input is [{df_type}]") + + peptide_num= peptide_num_threshold[df_type] + df_original_len = len(df) + + if df_type == 'taxa_func': + item_col = 'taxa_func' + df['taxa_func'] = df['Taxon'] + '&&&&' + df[self.func_name] + else: + item_col = 'Taxon' if df_type == 'taxa' else self.func_name + + # Group by item_col and filter based on peptide number + dict_item_pep_num = df.groupby(item_col).size().to_dict() + remove_list = [k for k, v in dict_item_pep_num.items() if v < peptide_num] + + # Remove rows based on peptide number threshold + df = df[~df[item_col].isin(remove_list)] + + if df_type == 'taxa_func': + df = df.drop('taxa_func', axis=1) + + self.peptide_num_used[df_type] = len(df) + print(f"Removed [{len(remove_list)} {df_type}] from [{df_original_len - len(df)} Peptides] with less than [{peptide_num}] peptides.") + + return df + + def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, outlier_params: dict = {'detect_method': None, 'handle_method': None, @@ -751,7 +810,7 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, #! fllowing code is for the normal mode # Update 'data_preprocess_params' - data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params, peptide_num_threshold) + data_preprocess_params = self.update_data_preprocess_parameters(data_preprocess_params) #2. sum the protein intensity if sum_protein: @@ -821,10 +880,15 @@ def strip_taxa(x, level): if not taxa_and_func_only_from_otf: # extract 'taxa', sample intensity #! 
and 'peptide_col' to avoid the duplicated items when handling outlier - df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list] + df_taxa_pep = df_filtered_peptides[[self.peptide_col_name,'Taxon'] + self.sample_list] # type: ignore print("\n-----Starting to perform outlier detection and handling for [Peptide-Taxon] table...-----") df_taxa_pep = self.detect_and_handle_outliers(df=df_taxa_pep, **outlier_params) - self.peptide_num_used['taxa'] = len(df_taxa_pep) + #TODO: use the peptide number after filtering the minimum peptide number + # statastic the peptide number of each taxa + df_taxa_pep = self.filter_peptides_num(df=df_taxa_pep, peptide_num_threshold=peptide_num_threshold, df_type='taxa') + + + # self.peptide_num_used['taxa'] = len(df_taxa_pep) # add column 'peptide_num' to df_taxa as 1 df_taxa_pep['peptide_num'] = 1 @@ -844,7 +908,8 @@ def strip_taxa(x, level): df_func_pep = df_func_pep[[self.peptide_col_name, self.func_name] + self.sample_list] print("\n-----Starting to perform outlier detection and handling for [Peptide-Function] table...-----") df_func_pep = self.detect_and_handle_outliers(df=df_func_pep, **outlier_params) - self.peptide_num_used['func'] = len(df_func_pep) + if not split_func: + df_func_pep = self.filter_peptides_num(df=df_func_pep, peptide_num_threshold=peptide_num_threshold, df_type='func') df_func_pep['peptide_num'] = 1 if quant_method == 'lfq': @@ -853,8 +918,10 @@ def strip_taxa(x, level): df_func = df_func_pep.groupby(self.func_name).sum(numeric_only=True) if split_func: + self.peptide_num_used['func'] = len(df_func_pep) df_func = self.split_func(df=df_func, split_func_params=split_func_params, df_type='func') - + df_func = self.filter_peptides_num_for_splited_func(df=df_func, peptide_num_threshold=peptide_num_threshold, df_type='func') + df_func = self.data_preprocess(df=df_func,df_name = 'func', **data_preprocess_params) self.func_df = df_func #-----Func Table End----- @@ -887,6 +954,8 @@ def 
strip_taxa(x, level): # ----- create taxa_func table ----- df_taxa_func = df_half_processed_peptides[[self.peptide_col_name, 'Taxon', self.func_name] + self.sample_list] df_taxa_func['peptide_num'] = 1 + if not split_func: + df_taxa_func = self.filter_peptides_num(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold, df_type='taxa_func') for key in ['taxa_func', 'taxa', 'func']: self.peptide_num_used[key] = len(df_taxa_func) if self.peptide_num_used[key] == 0 else self.peptide_num_used[key] @@ -899,6 +968,9 @@ def strip_taxa(x, level): # split the function before data preprocess if split_func: df_taxa_func = self.split_func( df=df_taxa_func, split_func_params=split_func_params, df_type='taxa_func') + df_taxa_func = self.filter_peptides_num_for_splited_func(df=df_taxa_func, peptide_num_threshold=peptide_num_threshold, + df_type='taxa_func') + print("\n-----Starting to perform data pre-processing for [Taxa-Function] table...-----") df_taxa_func_all_processed = self.data_preprocess(df=df_taxa_func @@ -1025,14 +1097,14 @@ def get_df(self, table_name:str = 'taxa'): sw.set_func('KEGG_Pathway_name') sw.set_group('Individual') sw.set_multi_tables(level='s', - outlier_params = {'detect_method': 'zero-dominant', 'handle_method': 'original', + outlier_params = {'detect_method': 'None', 'handle_method': 'original', "detection_by_group" : 'Individual', "handle_by_group": None}, data_preprocess_params = { - 'normalize_method': 'trace_shift', + 'normalize_method': 'None', 'transform_method': "log2", 'batch_meta': 'None', 'processing_order': ['transform', 'normalize', 'batch']}, - peptide_num_threshold = {'taxa': 2, 'func': 2, 'taxa_func': 2}, + peptide_num_threshold = {'taxa': 3, 'func': 3, 'taxa_func': 3}, keep_unknow_func=False, sum_protein=False, sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3}, split_func=True, split_func_params = {'split_by': '|', 'share_intensity': 
False}, diff --git a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py index fef5f33..a7ab723 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py +++ b/metax/taxafunc_analyzer/analyzer_utils/data_preprocessing.py @@ -614,7 +614,7 @@ def detect_and_handle_outliers(self, df: pd.DataFrame, def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, transform_method: str|None = None, batch_meta: str|None =None, processing_order:list|None =None, - df_name:str = "None", peptide_num_threshold:dict[str, int] ={'taxa': 1, 'func': 1, 'taxa_func': 1} + df_name:str = "None" ) -> pd.DataFrame: """ ## `data_preprocess` Method @@ -662,11 +662,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, - `taxa_func` - `protein` - `custom` - - `peptide_num_threshold` (`dict`, optional): - The threshold for the number of peptides in each DataFrame. Default values are: - - `taxa`: 3 - - `func`: 3 - - `taxa_func`: 3 + ### Returns: @@ -677,12 +673,7 @@ def data_preprocess(self, df: pd.DataFrame, normalize_method: str|None = None, df = df.copy() - # remove items with peptide number less than threshold - if df_name in ['taxa', 'func', 'taxa_func']: - print(f'{df_name.upper()} number before removing: {df.shape[0]}') - df = df[df['peptide_num'] >= peptide_num_threshold[df_name]] - print(f'{df_name.upper()} number with peptide_num >= [{peptide_num_threshold[df_name]}]: {df.shape[0]}') - + if processing_order is None: processing_order = ['transform', 'normalize', 'batch'] else: diff --git a/metax/utils/version.py b/metax/utils/version.py index 06dc858..b0d8e7d 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.115.3' +__version__ = '1.115.4' API_version = '3' \ No newline at end of file From 7343ea13b2fd8deea78f6c56e882a070934993e8 Mon Sep 17 00:00:00 2001 From: Qing Date: Wed, 16 Oct 2024 22:49:07 -0400 
Subject: [PATCH 4/4] - Fix: Fixed the bug of when use Anydata moed, the report will raise error. - Change: changed the approche of filter the minimum number of peptides threshold for the protein.(Avaliable for Razor and Anti-Razor method) --- Docs/ChangeLog.md | 3 +- metax/gui/main_gui.py | 8 +- metax/gui/metax_gui/main_window.ui | 146 +++++++++--------- metax/gui/metax_gui/ui_main_window.py | 102 ++++++------ metax/taxafunc_analyzer/analyzer.py | 74 ++++++++- .../analyzer_utils/razor_sum.py | 3 +- .../analyzer_utils/sum_protein_intensity.py | 57 +++++-- pyproject.toml | 2 +- 8 files changed, 243 insertions(+), 152 deletions(-) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index 3c07c5b..9e78d66 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,7 +1,8 @@ # Version: 1.115.4 ## Date: 2024-10-07 ### Changes: -- TODO: use the peptide number for 'self.peptide_num_used' after filtering the minimum peptide number +- Fix: Fixed the bug of when use Anydata moed, the report will raise error. 
+- Change: changed the approach to filtering proteins by the minimum peptide number threshold (available for the Razor and Anti-Razor methods). # Version: 1.115.3 ## Date: 2024-10-04 diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index 73d4479..7230595 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -993,10 +993,14 @@ def update_method_of_protein_inference(self): self.checkBox_infrence_protein_by_sample.setChecked(True) self.checkBox_infrence_protein_by_sample.setEnabled(False) self.comboBox_protein_ranking_method.setEnabled(False) + # enable the peptide_num_threshold + self.spinBox_peptide_num_threshold_protein.setEnabled(True) else: # method is ["rank"] self.checkBox_infrence_protein_by_sample.setEnabled(True) self.comboBox_protein_ranking_method.setEnabled(True) self.checkBox_infrence_protein_by_sample.setChecked(False) + # disable the peptide_num_threshold + self.spinBox_peptide_num_threshold_protein.setEnabled(False) @@ -1853,11 +1857,11 @@ def run_after_set_multi_tables(self): # Final message if self.tfa.any_df_mode: - num_item = self.tfa.custom_df.shape[0] + original_num_peptide = self.tfa.custom_df.shape[0] msg = f"""

Custom data is ready!

-

Number of items: [{num_item}]

+

Number of items: [{original_num_peptide}]

""" diff --git a/metax/gui/metax_gui/main_window.ui b/metax/gui/metax_gui/main_window.ui index 530ed76..818a6dc 100644 --- a/metax/gui/metax_gui/main_window.ui +++ b/metax/gui/metax_gui/main_window.ui @@ -46,7 +46,7 @@ Qt::LeftToRight - 2 + 4 false @@ -245,8 +245,8 @@ 0 0 - 391 - 80 + 528 + 534 @@ -1476,7 +1476,7 @@ 16777215 - 280 + 300 @@ -1505,7 +1505,7 @@ 0 0 - 660 + 1016 232 @@ -2759,7 +2759,7 @@ 16777215 - 280 + 300 @@ -2776,7 +2776,7 @@ 0 0 - 621 + 999 150 @@ -3750,7 +3750,7 @@ 16777215 - 240 + 280 @@ -3779,7 +3779,7 @@ 0 0 - 878 + 1020 128 @@ -4801,7 +4801,7 @@ QTabWidget::Triangular - 3 + 2 @@ -5144,7 +5144,7 @@ - + false @@ -5239,64 +5239,11 @@ - - - - Qt::Horizontal - - - - - - - false - - - Run Deseq2 - - - - - - - - - - - 0 - 0 - - - - Groups (Default all) - - - - - - - Control Group - - - - - - - - 0 - 0 - - - - Comparing in Each Condition - - - @@ -5321,8 +5268,61 @@ + + + + + 0 + 0 + + + + Comparing in Each Condition + + + + + + + + + + + 0 + 0 + + + + Groups (Default all) + + + + + + + Control Group + + + + + + + Qt::Horizontal + + + + + + + false + + + Run Deseq2 + + + @@ -6207,7 +6207,7 @@ 16777215 - 220 + 240 @@ -7437,7 +7437,7 @@ 16777215 - 220 + 240 @@ -7454,8 +7454,8 @@ 0 0 - 620 - 65 + 1016 + 105 @@ -7819,7 +7819,7 @@ QTabWidget::Triangular - 0 + 1 @@ -8146,7 +8146,7 @@ 16777215 - 220 + 240 @@ -9258,7 +9258,7 @@ 16777215 - 220 + 240 @@ -9275,8 +9275,8 @@ 0 0 - 383 - 68 + 1016 + 141 diff --git a/metax/gui/metax_gui/ui_main_window.py b/metax/gui/metax_gui/ui_main_window.py index efb045f..f1c6c9b 100644 --- a/metax/gui/metax_gui/ui_main_window.py +++ b/metax/gui/metax_gui/ui_main_window.py @@ -147,7 +147,7 @@ def setupUi(self, metaX_main): self.toolBox_2.setMaximumSize(QtCore.QSize(1677, 16777215)) self.toolBox_2.setObjectName("toolBox_2") self.page_2 = QtWidgets.QWidget() - self.page_2.setGeometry(QtCore.QRect(0, 0, 391, 80)) + self.page_2.setGeometry(QtCore.QRect(0, 0, 528, 534)) self.page_2.setObjectName("page_2") self.gridLayout_27 = 
QtWidgets.QGridLayout(self.page_2) self.gridLayout_27.setObjectName("gridLayout_27") @@ -730,7 +730,7 @@ def setupUi(self, metaX_main): self.line_7.setObjectName("line_7") self.gridLayout_26.addWidget(self.line_7, 1, 0, 1, 3) self.groupBox_basic_plot = QtWidgets.QGroupBox(self.tab_12) - self.groupBox_basic_plot.setMaximumSize(QtCore.QSize(16777215, 280)) + self.groupBox_basic_plot.setMaximumSize(QtCore.QSize(16777215, 300)) self.groupBox_basic_plot.setObjectName("groupBox_basic_plot") self.gridLayout_40 = QtWidgets.QGridLayout(self.groupBox_basic_plot) self.gridLayout_40.setObjectName("gridLayout_40") @@ -744,7 +744,7 @@ def setupUi(self, metaX_main): self.scrollArea.setWidgetResizable(True) self.scrollArea.setObjectName("scrollArea") self.scrollAreaWidgetContents = QtWidgets.QWidget() - self.scrollAreaWidgetContents.setGeometry(QtCore.QRect(0, 0, 660, 232)) + self.scrollAreaWidgetContents.setGeometry(QtCore.QRect(0, 0, 1016, 232)) self.scrollAreaWidgetContents.setObjectName("scrollAreaWidgetContents") self.gridLayout_34 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents) self.gridLayout_34.setObjectName("gridLayout_34") @@ -1400,7 +1400,7 @@ def setupUi(self, metaX_main): self.pushButton_basic_heatmap_add.setObjectName("pushButton_basic_heatmap_add") self.gridLayout_23.addWidget(self.pushButton_basic_heatmap_add, 5, 3, 1, 1) self.groupBox_basic_heatmap_plot_settings = QtWidgets.QGroupBox(self.tab_13) - self.groupBox_basic_heatmap_plot_settings.setMaximumSize(QtCore.QSize(16777215, 280)) + self.groupBox_basic_heatmap_plot_settings.setMaximumSize(QtCore.QSize(16777215, 300)) self.groupBox_basic_heatmap_plot_settings.setObjectName("groupBox_basic_heatmap_plot_settings") self.gridLayout_41 = QtWidgets.QGridLayout(self.groupBox_basic_heatmap_plot_settings) self.gridLayout_41.setObjectName("gridLayout_41") @@ -1408,7 +1408,7 @@ def setupUi(self, metaX_main): self.scrollArea_2.setWidgetResizable(True) self.scrollArea_2.setObjectName("scrollArea_2") 
self.scrollAreaWidgetContents_2 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 621, 150)) + self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 999, 150)) self.scrollAreaWidgetContents_2.setObjectName("scrollAreaWidgetContents_2") self.gridLayout_50 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_2) self.gridLayout_50.setObjectName("gridLayout_50") @@ -1943,7 +1943,7 @@ def setupUi(self, metaX_main): self.gridLayout_46.addWidget(self.checkBox_2, 1, 0, 1, 1) self.gridLayout_75.addLayout(self.gridLayout_46, 0, 0, 1, 1) self.groupBox_cross_heatmap_settings = QtWidgets.QGroupBox(self.groupBox_cross_heatmap_plot) - self.groupBox_cross_heatmap_settings.setMaximumSize(QtCore.QSize(16777215, 240)) + self.groupBox_cross_heatmap_settings.setMaximumSize(QtCore.QSize(16777215, 280)) self.groupBox_cross_heatmap_settings.setObjectName("groupBox_cross_heatmap_settings") self.gridLayout_52 = QtWidgets.QGridLayout(self.groupBox_cross_heatmap_settings) self.gridLayout_52.setObjectName("gridLayout_52") @@ -1957,7 +1957,7 @@ def setupUi(self, metaX_main): self.scrollArea_cross_heatmap_settings.setWidgetResizable(True) self.scrollArea_cross_heatmap_settings.setObjectName("scrollArea_cross_heatmap_settings") self.scrollAreaWidgetContents_3 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 878, 128)) + self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 1020, 128)) self.scrollAreaWidgetContents_3.setObjectName("scrollAreaWidgetContents_3") self.gridLayout_38 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_3) self.gridLayout_38.setObjectName("gridLayout_38") @@ -2591,7 +2591,7 @@ def setupUi(self, metaX_main): self.pushButton_dunnett_test = QtWidgets.QPushButton(self.tab_16) self.pushButton_dunnett_test.setEnabled(False) self.pushButton_dunnett_test.setObjectName("pushButton_dunnett_test") - self.gridLayout_33.addWidget(self.pushButton_dunnett_test, 10, 1, 1, 2) + 
self.gridLayout_33.addWidget(self.pushButton_dunnett_test, 10, 1, 1, 1) self.horizontalLayout_39 = QtWidgets.QHBoxLayout() self.horizontalLayout_39.setObjectName("horizontalLayout_39") self.label_112 = QtWidgets.QLabel(self.tab_16) @@ -2636,42 +2636,11 @@ def setupUi(self, metaX_main): self.horizontalLayout_73.addWidget(self.comboBox_group_control_condition_group) self.horizontalLayout_39.addLayout(self.horizontalLayout_73) self.gridLayout_33.addLayout(self.horizontalLayout_39, 1, 1, 1, 2) - self.line_26 = QtWidgets.QFrame(self.tab_16) - self.line_26.setFrameShape(QtWidgets.QFrame.HLine) - self.line_26.setFrameShadow(QtWidgets.QFrame.Sunken) - self.line_26.setObjectName("line_26") - self.gridLayout_33.addWidget(self.line_26, 9, 1, 1, 2) - self.pushButton_multi_deseq2 = QtWidgets.QPushButton(self.tab_16) - self.pushButton_multi_deseq2.setEnabled(False) - self.pushButton_multi_deseq2.setObjectName("pushButton_multi_deseq2") - self.gridLayout_33.addWidget(self.pushButton_multi_deseq2, 11, 1, 1, 2) self.gridLayout_72 = QtWidgets.QGridLayout() self.gridLayout_72.setObjectName("gridLayout_72") - self.horizontalLayout_dunnett_group = QtWidgets.QHBoxLayout() - self.horizontalLayout_dunnett_group.setObjectName("horizontalLayout_dunnett_group") - self.gridLayout_72.addLayout(self.horizontalLayout_dunnett_group, 1, 1, 1, 1) - self.label_114 = QtWidgets.QLabel(self.tab_16) - sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Preferred, QtWidgets.QSizePolicy.Fixed) - sizePolicy.setHorizontalStretch(0) - sizePolicy.setVerticalStretch(0) - sizePolicy.setHeightForWidth(self.label_114.sizePolicy().hasHeightForWidth()) - self.label_114.setSizePolicy(sizePolicy) - self.label_114.setObjectName("label_114") - self.gridLayout_72.addWidget(self.label_114, 0, 1, 1, 1) self.comboBox_dunnett_control_group = QtWidgets.QComboBox(self.tab_16) self.comboBox_dunnett_control_group.setObjectName("comboBox_dunnett_control_group") 
self.gridLayout_72.addWidget(self.comboBox_dunnett_control_group, 1, 0, 1, 1) - self.label_115 = QtWidgets.QLabel(self.tab_16) - self.label_115.setObjectName("label_115") - self.gridLayout_72.addWidget(self.label_115, 0, 0, 1, 1) - self.checkBox_comparing_group_control_in_condition = QtWidgets.QCheckBox(self.tab_16) - sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Preferred, QtWidgets.QSizePolicy.Fixed) - sizePolicy.setHorizontalStretch(0) - sizePolicy.setVerticalStretch(0) - sizePolicy.setHeightForWidth(self.checkBox_comparing_group_control_in_condition.sizePolicy().hasHeightForWidth()) - self.checkBox_comparing_group_control_in_condition.setSizePolicy(sizePolicy) - self.checkBox_comparing_group_control_in_condition.setObjectName("checkBox_comparing_group_control_in_condition") - self.gridLayout_72.addWidget(self.checkBox_comparing_group_control_in_condition, 2, 0, 1, 1) self.horizontalLayout_24 = QtWidgets.QHBoxLayout() self.horizontalLayout_24.setObjectName("horizontalLayout_24") self.label_140 = QtWidgets.QLabel(self.tab_16) @@ -2687,7 +2656,38 @@ def setupUi(self, metaX_main): self.comboBox_group_control_comparing_each_condition_meta.setObjectName("comboBox_group_control_comparing_each_condition_meta") self.horizontalLayout_24.addWidget(self.comboBox_group_control_comparing_each_condition_meta) self.gridLayout_72.addLayout(self.horizontalLayout_24, 2, 1, 1, 1) + self.checkBox_comparing_group_control_in_condition = QtWidgets.QCheckBox(self.tab_16) + sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Preferred, QtWidgets.QSizePolicy.Fixed) + sizePolicy.setHorizontalStretch(0) + sizePolicy.setVerticalStretch(0) + sizePolicy.setHeightForWidth(self.checkBox_comparing_group_control_in_condition.sizePolicy().hasHeightForWidth()) + self.checkBox_comparing_group_control_in_condition.setSizePolicy(sizePolicy) + self.checkBox_comparing_group_control_in_condition.setObjectName("checkBox_comparing_group_control_in_condition") + 
self.gridLayout_72.addWidget(self.checkBox_comparing_group_control_in_condition, 2, 0, 1, 1) + self.horizontalLayout_dunnett_group = QtWidgets.QHBoxLayout() + self.horizontalLayout_dunnett_group.setObjectName("horizontalLayout_dunnett_group") + self.gridLayout_72.addLayout(self.horizontalLayout_dunnett_group, 1, 1, 1, 1) + self.label_114 = QtWidgets.QLabel(self.tab_16) + sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Preferred, QtWidgets.QSizePolicy.Fixed) + sizePolicy.setHorizontalStretch(0) + sizePolicy.setVerticalStretch(0) + sizePolicy.setHeightForWidth(self.label_114.sizePolicy().hasHeightForWidth()) + self.label_114.setSizePolicy(sizePolicy) + self.label_114.setObjectName("label_114") + self.gridLayout_72.addWidget(self.label_114, 0, 1, 1, 1) + self.label_115 = QtWidgets.QLabel(self.tab_16) + self.label_115.setObjectName("label_115") + self.gridLayout_72.addWidget(self.label_115, 0, 0, 1, 1) self.gridLayout_33.addLayout(self.gridLayout_72, 4, 1, 1, 2) + self.line_26 = QtWidgets.QFrame(self.tab_16) + self.line_26.setFrameShape(QtWidgets.QFrame.HLine) + self.line_26.setFrameShadow(QtWidgets.QFrame.Sunken) + self.line_26.setObjectName("line_26") + self.gridLayout_33.addWidget(self.line_26, 9, 1, 1, 2) + self.pushButton_multi_deseq2 = QtWidgets.QPushButton(self.tab_16) + self.pushButton_multi_deseq2.setEnabled(False) + self.pushButton_multi_deseq2.setObjectName("pushButton_multi_deseq2") + self.gridLayout_33.addWidget(self.pushButton_multi_deseq2, 10, 2, 1, 1) self.tabWidget_3.addTab(self.tab_16, "") self.tab_19 = QtWidgets.QWidget() self.tab_19.setObjectName("tab_19") @@ -3184,7 +3184,7 @@ def setupUi(self, metaX_main): self.gridLayout_co_expr_sample.setObjectName("gridLayout_co_expr_sample") self.gridLayout_47.addLayout(self.gridLayout_co_expr_sample, 3, 1, 1, 3) self.groupBox_co_expression_plot_settings = QtWidgets.QGroupBox(self.tab_5) - self.groupBox_co_expression_plot_settings.setMaximumSize(QtCore.QSize(16777215, 220)) + 
self.groupBox_co_expression_plot_settings.setMaximumSize(QtCore.QSize(16777215, 240)) self.groupBox_co_expression_plot_settings.setObjectName("groupBox_co_expression_plot_settings") self.gridLayout_56 = QtWidgets.QGridLayout(self.groupBox_co_expression_plot_settings) self.gridLayout_56.setObjectName("gridLayout_56") @@ -3829,7 +3829,7 @@ def setupUi(self, metaX_main): self.label_100.setObjectName("label_100") self.gridLayout_24.addWidget(self.label_100, 5, 0, 1, 1) self.groupBox_expression_trends_plot_settings = QtWidgets.QGroupBox(self.tab_15) - self.groupBox_expression_trends_plot_settings.setMaximumSize(QtCore.QSize(16777215, 220)) + self.groupBox_expression_trends_plot_settings.setMaximumSize(QtCore.QSize(16777215, 240)) self.groupBox_expression_trends_plot_settings.setObjectName("groupBox_expression_trends_plot_settings") self.gridLayout_60 = QtWidgets.QGridLayout(self.groupBox_expression_trends_plot_settings) self.gridLayout_60.setObjectName("gridLayout_60") @@ -3837,7 +3837,7 @@ def setupUi(self, metaX_main): self.scrollArea_5.setWidgetResizable(True) self.scrollArea_5.setObjectName("scrollArea_5") self.scrollAreaWidgetContents_6 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 620, 65)) + self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 1016, 105)) self.scrollAreaWidgetContents_6.setObjectName("scrollAreaWidgetContents_6") self.gridLayout_57 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_6) self.gridLayout_57.setObjectName("gridLayout_57") @@ -4210,7 +4210,7 @@ def setupUi(self, metaX_main): self.label_149.setObjectName("label_149") self.gridLayout_4.addWidget(self.label_149, 0, 0, 1, 1) self.groupBox_taxa_func_link_plot_settings = QtWidgets.QGroupBox(self.tab_8) - self.groupBox_taxa_func_link_plot_settings.setMaximumSize(QtCore.QSize(16777215, 220)) + self.groupBox_taxa_func_link_plot_settings.setMaximumSize(QtCore.QSize(16777215, 240)) 
self.groupBox_taxa_func_link_plot_settings.setObjectName("groupBox_taxa_func_link_plot_settings") self.gridLayout_65 = QtWidgets.QGridLayout(self.groupBox_taxa_func_link_plot_settings) self.gridLayout_65.setObjectName("gridLayout_65") @@ -4838,7 +4838,7 @@ def setupUi(self, metaX_main): self.pushButton_plot_network.setObjectName("pushButton_plot_network") self.gridLayout_6.addWidget(self.pushButton_plot_network, 10, 1, 1, 3) self.groupBox_taxa_func_link_net_plot_settings = QtWidgets.QGroupBox(self.tab_9) - self.groupBox_taxa_func_link_net_plot_settings.setMaximumSize(QtCore.QSize(16777215, 220)) + self.groupBox_taxa_func_link_net_plot_settings.setMaximumSize(QtCore.QSize(16777215, 240)) self.groupBox_taxa_func_link_net_plot_settings.setObjectName("groupBox_taxa_func_link_net_plot_settings") self.gridLayout_63 = QtWidgets.QGridLayout(self.groupBox_taxa_func_link_net_plot_settings) self.gridLayout_63.setObjectName("gridLayout_63") @@ -4846,7 +4846,7 @@ def setupUi(self, metaX_main): self.scrollArea_7.setWidgetResizable(True) self.scrollArea_7.setObjectName("scrollArea_7") self.scrollAreaWidgetContents_8 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_8.setGeometry(QtCore.QRect(0, 0, 383, 68)) + self.scrollAreaWidgetContents_8.setGeometry(QtCore.QRect(0, 0, 1016, 141)) self.scrollAreaWidgetContents_8.setObjectName("scrollAreaWidgetContents_8") self.gridLayout_66 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_8) self.gridLayout_66.setObjectName("gridLayout_66") @@ -5417,12 +5417,12 @@ def setupUi(self, metaX_main): self.retranslateUi(metaX_main) self.stackedWidget.setCurrentIndex(0) - self.tabWidget_TaxaFuncAnalyzer.setCurrentIndex(2) + self.tabWidget_TaxaFuncAnalyzer.setCurrentIndex(4) self.toolBox_2.setCurrentIndex(0) self.tabWidget_4.setCurrentIndex(1) - self.tabWidget_3.setCurrentIndex(3) + self.tabWidget_3.setCurrentIndex(2) self.tabWidget.setCurrentIndex(1) - self.tabWidget_2.setCurrentIndex(0) + self.tabWidget_2.setCurrentIndex(1) 
self.tabWidget_6.setCurrentIndex(1) self.toolBox_metalab_res_anno.setCurrentIndex(0) self.tabWidget_5.setCurrentIndex(0) @@ -5839,11 +5839,11 @@ def retranslateUi(self, metaX_main): self.comboBox_table_for_dunnett.setItemText(3, _translate("metaX_main", "peptides")) self.label_113.setText(_translate("metaX_main", "Meta")) self.checkBox_group_control_in_condition.setText(_translate("metaX_main", "In Condition")) - self.pushButton_multi_deseq2.setText(_translate("metaX_main", "Run Deseq2")) + self.label_140.setText(_translate("metaX_main", " By:")) + self.checkBox_comparing_group_control_in_condition.setText(_translate("metaX_main", "Comparing in Each Condition")) self.label_114.setText(_translate("metaX_main", "Groups (Default all)")) self.label_115.setText(_translate("metaX_main", "Control Group")) - self.checkBox_comparing_group_control_in_condition.setText(_translate("metaX_main", "Comparing in Each Condition")) - self.label_140.setText(_translate("metaX_main", " By:")) + self.pushButton_multi_deseq2.setText(_translate("metaX_main", "Run Deseq2")) self.tabWidget_3.setTabText(self.tabWidget_3.indexOf(self.tab_16), _translate("metaX_main", "Group-Control TEST ")) self.label_166.setText(_translate("metaX_main", "Groups")) self.pushButton_deseq2.setText(_translate("metaX_main", "Run DESeq2")) diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py index b21e774..540e421 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -54,6 +54,7 @@ def __init__( self.peptide_col_name = peptide_col_name self.protein_col_name = protein_col_name + self.protein_separator = ';' self.custom_col_name = custom_col_name self.sample_list: Optional[List[str]] = None self.meta_df: Optional[pd.DataFrame] = None @@ -78,6 +79,7 @@ def __init__( self.any_df_mode = any_df_mode # if True, the consider the TaxaFunc df as other_df self.custom_df: Optional[pd.DataFrame] = None # other df, any df that user want to add 
self.peptide_num_used = {'taxa': 0, 'func': 0, 'taxa_func': 0, 'protein': 0} + self.distinct_peptides_list: list|None = None self.split_func_status:bool = False self.split_func_sep:str = '' @@ -689,6 +691,18 @@ def run_lfq_for_taxa_func(self, df_taxa_func): return df_taxa_func + def calculate_distinct_peptides(self): #! NOT USED YET + # extract the peptide column and protein_col_name + print("Calculating distinct peptides list...") + extract_cols = [self.peptide_col_name, self.protein_col_name] + df = self.original_df[extract_cols] + separate_protein = self.protein_separator + df['protein_num'] = df[self.protein_col_name].apply(lambda x: len(x.split(separate_protein))) + df = df[df['protein_num'] == 1] + distinct_peptides = df[self.peptide_col_name].tolist() + self.distinct_peptides_list = distinct_peptides + + def update_data_preprocess_parameters(self, data_preprocess_params): normalize_method = data_preprocess_params['normalize_method'] @@ -706,13 +720,12 @@ def update_data_preprocess_parameters(self, data_preprocess_params): return data_preprocess_params - def filter_peptides_num_for_splited_func(self, df, peptide_num_threshold, df_type, distinct_threshold_mode=False): + def filter_peptides_num_for_splited_func(self, df, peptide_num_threshold, df_type): ''' Only for the splited func table or taxa_func table - df: the splited func table or taxa_func table which has been grouped, index is the func or taxa_func - peptide_num_threshold: the threshold of peptide number for each func or taxa_func - df_type: 'func' or 'taxa_func' - - distinct_threshold_mode: TODO ''' valid_df_types = ['func', 'taxa_func'] @@ -750,6 +763,49 @@ def filter_peptides_num(self, df, peptide_num_threshold, df_type, distinct_thres else: item_col = 'Taxon' if df_type == 'taxa' else self.func_name + # # if True: #! 
Need to be implemented + # if distinct_threshold_mode: + # if self.distinct_peptides_list is None: + # self.calculate_distinct_peptides() + + # peptides_in_taxa_func = defaultdict(list) + # peptides_in_taxa = defaultdict(list) + # peptides_in_func = defaultdict(list) + # skiped_peptides_list = [] + # for row in tqdm(df.itertuples(index=False), total=len(df), desc="Creating peptides_dict"): + # peptide = row[0] + # if peptide not in self.distinct_peptides_list: + # skiped_peptides_list.append(peptide) + # continue + + # if df_type == 'taxa': + # taxa = row[1] + # # Append peptide to taxa list + # peptides_in_taxa[taxa].append(peptide) + + # if self.split_func_status: + # func_list = [f.strip() for f in row[2].split(self.split_func_sep)] + # # Process each function in the func_list + # for func in func_list: + # peptides_in_func[func].append(peptide) + # taxa_func = f'{taxa}&&&&{func}' + # peptides_in_taxa_func[taxa_func].append(peptide) + # else: + # if df_type in ['func', 'taxa_func']: + # taxa = row[1] + # func = row[2] + # # Append peptide to func list + # peptides_in_func[func].append(peptide) + # # Create combined key for taxa_func + # taxa_func = f'{taxa}&&&&{func}' + # peptides_in_taxa_func[taxa_func].append(peptide) + + # peitides_dict = {'taxa': peptides_in_taxa, 'func': peptides_in_func, 'taxa_func': peptides_in_taxa_func} + # remove_list = [k for k, v in peitides_dict[df_type].items() if len(v) < peptide_num] + # skiped_peptides_list = set(skiped_peptides_list) + + + # else: # Group by item_col and filter based on peptide number dict_item_pep_num = df.groupby(item_col).size().to_dict() remove_list = [k for k, v in dict_item_pep_num.items() if v < peptide_num] @@ -761,7 +817,7 @@ def filter_peptides_num(self, df, peptide_num_threshold, df_type, distinct_thres df = df.drop('taxa_func', axis=1) self.peptide_num_used[df_type] = len(df) - print(f"Removed [{len(remove_list)} {df_type}] from [{df_original_len - len(df)} Peptides] with less than [{peptide_num}] 
peptides.") + print(f"Removed [{len(set((remove_list)))} {df_type}] from [{df_original_len - len(df)} Peptides] with less than [{peptide_num}] peptides.") return df @@ -819,7 +875,10 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, self.peptide_num_used['protein'] = 0 sum_protein_params['quant_method'] = quant_method df_peptide_for_protein = self.detect_and_handle_outliers(df=self.original_df, **outlier_params) - self.protein_df = SumProteinIntensity(taxa_func_analyzer=self, df=df_peptide_for_protein).sum_protein_intensity( **sum_protein_params) + self.protein_df = SumProteinIntensity(taxa_func_analyzer=self, df=df_peptide_for_protein, + peptide_num_threshold=sum_protein_params['peptide_num_threshold'], + protein_separator = self.protein_separator + ).sum_protein_intensity( **sum_protein_params) self.protein_df = self.data_preprocess(df=self.protein_df,df_name = 'protein', **data_preprocess_params) @@ -1105,9 +1164,10 @@ def get_df(self, table_name:str = 'taxa'): 'batch_meta': 'None', 'processing_order': ['transform', 'normalize', 'batch']}, peptide_num_threshold = {'taxa': 3, 'func': 3, 'taxa_func': 3}, - keep_unknow_func=False, sum_protein=False, - sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3}, - split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False}, + keep_unknow_func=False, + sum_protein=True, + sum_protein_params = {'method': 'anti-razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap', 'peptide_num_threshold': 3}, + split_func=False, split_func_params = {'split_by': '|', 'share_intensity': False}, taxa_and_func_only_from_otf=False, quant_method='sum' ) diff --git a/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py b/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py index 7b20bda..5caf2d1 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py +++ 
b/metax/taxafunc_analyzer/analyzer_utils/razor_sum.py @@ -130,7 +130,8 @@ def get_mini_target_set(self, greedy_method='heap'): self.greedy_method = greedy_method print('Start to get minimum target set using method: [razor]') # only extract the peptide and target columns - extract_cols = [self.column_map['peptide'], self.column_map['target']] + self.column_map['sample_list'] if self.column_map['sample_list'] else [] + extract_cols = [self.column_map['peptide'], self.column_map['target']] + extract_cols = extract_cols + self.column_map['sample_list'] if self.column_map['sample_list'] else extract_cols # if NA in target column, or '', raise error if self.df[self.column_map['target']].isna().any() or '' in self.df[self.column_map['target']].values: raise ValueError(f'NA or empty value in target column: {self.column_map["target"]}') diff --git a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py index 17209aa..d6c0e26 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py +++ b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py @@ -69,7 +69,7 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un greedy_method: str, default 'heap'. 
only used for `razor` method options: ['greedy', 'heap'] peptide_num_threshold: int, default None - the protein must have at least 3 peptides to be considered as a target + the protein must have at least number peptides to be considered as a target quant_method: str, default 'sum' options: ['sum', 'lfq'] ''' @@ -82,23 +82,10 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un if peptide_num_threshold is not None: self.peptide_num_threshold = peptide_num_threshold - # remove the protein with less than the threshold of peptides - # use teh methood in RazorSum - razor_integrator = RazorSum(df=self.df, - column_map={ - 'peptide': self.tfa.peptide_col_name, - 'target': self.tfa.protein_col_name, - 'sample_list': self.tfa.sample_list, - }, - peptide_num_threshold=self.peptide_num_threshold, - share_intensity=self.share_intensity, - greedy_method=greedy_method, - protein_separator= self.protein_separator) - self.rank_method = rank_method self.check_protein_col() - self.df = razor_integrator.remove_protein_less_than_threshold() + #innitialize the peptide number used as the total number of peptides self.tfa.peptide_num_used['protein'] = len(self.df) if method == 'rank': @@ -121,8 +108,19 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un self._sum_protein_rank(sample, by_sample) elif method == 'razor': print('start to sum protein intensity using method: [razor]') + # use teh methood in RazorSum + razor_integrator = RazorSum(df=self.df, + column_map={ + 'peptide': self.tfa.peptide_col_name, + 'target': self.tfa.protein_col_name, + 'sample_list': self.tfa.sample_list, + }, + peptide_num_threshold=self.peptide_num_threshold, + share_intensity=self.share_intensity, + greedy_method=greedy_method, + protein_separator= self.protein_separator) if quant_method == 'sum': - razor_integrator.peptide_num_threshold = 1 # set the threshold to 1, to avoid run filter again + # razor_integrator.peptide_num_threshold = 1 # set 
the threshold to 1, to avoid run filter again res_df = razor_integrator.sum_protein_intensity(greedy_method=greedy_method) elif quant_method == 'lfq': from .lfq import run_lfq @@ -137,10 +135,13 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un # move teh 2 columns to the front res_df = res_df[['peptides', 'peptide_num'] + [col for col in res_df.columns if col not in ['peptides', 'peptide_num']]] + self.tfa.peptide_num_used['protein'] = len(razor_integrator.df) return res_df elif method == 'anti-razor': print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------") + #calculate the peptide number for each protein + self.filter_protein_by_peptide_num() for sample in self.tfa.sample_list: self._sum_protein_anti_razor(sample) @@ -161,6 +162,30 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un return res_df + + def filter_protein_by_peptide_num(self): + if self.peptide_num_threshold < 2: + return self.df + else: + peptide_col_name = self.tfa.peptide_col_name + protein_col_name = self.tfa.protein_col_name + df= self.df.copy() + target_to_peptides = defaultdict(set) + for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating target to peptides mapping"): + sequence = row[peptide_col_name] + targets = row[protein_col_name].split(self.protein_separator) + for target in targets: + target_to_peptides[target].add(sequence) + proteins_less_than_threshold = [target for target, peps in target_to_peptides.items() if len(peps) < self.peptide_num_threshold] + print(f'Number of proteins with less than {self.peptide_num_threshold} peptides: {len(proteins_less_than_threshold)}') + # remove the proteins with less than 3 peptides from the protein column of the df + df[protein_col_name] = df[protein_col_name].apply(lambda x: ';'.join([protein for protein in x.split(self.protein_separator) if protein not in proteins_less_than_threshold])) + 
self.df[protein_col_name] = df[protein_col_name] + # remove the row with empty protein + self.df = self.df[self.df[protein_col_name].str.strip() != ''] + self.tfa.peptide_num_used['protein'] = len(self.df) + return self.df + # razor method def find_minimum_protein_set(self, peptides, protein_to_peptides): protein_to_peptides_copy = protein_to_peptides.copy() diff --git a/pyproject.toml b/pyproject.toml index b99ae8e..bb5194f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "MetaXTools" -version = "1.115.3" +version = "1.115.4" description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics." readme = "README_PyPi.md" license = { text = "NorthOmics" }
CategoryNumberNumber (After Filtering) Used Peptides % of All Peptides