From 3cccd4ebfeeb52fc7298f3c790af87d32fc3d5c9 Mon Sep 17 00:00:00 2001 From: Qing <44231502+byemaxx@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:19:48 -0400 Subject: [PATCH] - Fix: 1. Fixed the incorrect group order in the title of the volcano plot. 2. col scale bug when plotting the basic heatmap. - Change: 1. Enable alpha/beta diversity for all types of tables. 2. Only assign peptide to one protein rather than share the intensity when summing peptides to proteins by razor method. --- Docs/ChangeLog.md | 7 + metax/gui/main_gui.py | 22 +- metax/gui/metax_gui/main_window.ui | 34 +-- metax/gui/metax_gui/ui_main_window.py | 26 +-- metax/peptide_annotator/convert_id_to_name.py | 16 +- metax/taxafunc_analyzer/analyzer.py | 1 + .../analyzer_utils/sum_protein_intensity.py | 25 +- metax/taxafunc_ploter/diversity_plot.py | 21 +- metax/taxafunc_ploter/heatmap_plot.py | 7 +- metax/taxafunc_ploter/volcano_plot.py | 18 +- metax/taxafunc_ploter/volcano_plot_js.py | 8 +- metax/utils/scripts/razor_sum.py | 214 ++++++++++++++++++ metax/utils/version.py | 2 +- 13 files changed, 336 insertions(+), 65 deletions(-) create mode 100644 metax/utils/scripts/razor_sum.py diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index 6b68082..c078e15 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,3 +1,10 @@ +# Version: 1.110.0 +## Date: 2024-08-12 +### Changes: +- Fix: 1. Fixed the incorrect group order in the title of the volcano plot. 2. col scale bug when plotting the basic heatmap. +- Change: 1. Enable alpha/beta diversity for all types of tables. 2. Only assign peptide to one protein rather than share the intensity when summing peptides to proteins by razor method. 
+ + # Version: 1.109.12 ## Date: 2024-08-10 ### Changes: diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index 2d48dbe..771f677 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -592,8 +592,11 @@ def get_list_by_df_type(self, df_type:str, remove_no_linked:bool=False, silent:b return res_list def change_event_checkBox_basic_plot_table(self): - taxa_only_button_list = [self.pushButton_plot_alpha_div, self.pushButton_plot_beta_div, - self.pushButton_plot_sunburst, self.pushButton_plot_basic_treemap] + taxa_only_button_list = [ + # self.pushButton_plot_alpha_div, + # self.pushButton_plot_beta_div, + self.pushButton_plot_sunburst, + self.pushButton_plot_basic_treemap] taxa_func_button_list = [self.pushButton_plot_basic_sankey] @@ -2984,6 +2987,8 @@ def enable_multi_button(self, state=True): self.pushButton_trends_clean_list, self.comboBox_trends_table, self.pushButton_plot_pca_js, + self.pushButton_plot_alpha_div, + self.pushButton_plot_beta_div, self.pushButton_trends_add_a_list, self.pushButton_co_expr_add_a_list, self.pushButton_basic_heatmap_add_a_list, @@ -4217,8 +4222,9 @@ def get_title_by_table_name(self, table_name): width=width, height=height, font_size=font_size, plot_all_samples=plot_all_samples, theme=theme, sub_meta = sub_meta, show_fliers = show_fliers, - legend_col_num=legend_col_num, rename_sample = rename_sample) - self.update_table_dict('alpha_diversity', aplha_diversity_df) + legend_col_num=legend_col_num, rename_sample = rename_sample, + df_type=table_name, title_name=title_name) + self.update_table_dict(f'alpha_diversity({title_name})', aplha_diversity_df) elif method == "beta_div": self.show_message('Beta diversity is running, please wait...') metric = self.comboBox_beta_div_method.currentText() @@ -4227,8 +4233,8 @@ def get_title_by_table_name(self, table_name): rename_sample = rename_sample, show_label = show_label, adjust_label = adjust_label, theme=theme,sub_meta = sub_meta, legend_col_num=legend_col_num, - dot_size = 
dot_size) - self.update_table_dict('beta_diversity_distance_matrix', beta_diversity_distance_matrix) + dot_size = dot_size, df_type=table_name, title_name=title_name) + self.update_table_dict(f'beta_diversity_distance_matrix({title_name})', beta_diversity_distance_matrix) elif method == 'sunburst': @@ -4926,7 +4932,7 @@ def plot_deseq2_volcano(self): height = self.spinBox_fc_plot_height.value() group1 = self.comboBox_deseq2_group1.currentText() group2 = self.comboBox_deseq2_group2.currentText() - title_name = f'{group1} vs {group2} of {table_name.split("(")[1].split(")")[0]}' + title_name = f'{group2} vs {group1} of {table_name.split("(")[1].split(")")[0]}' font_size = self.spinBox_deseq2_font_size.value() dot_size = self.spinBox_deseq2_dot_size.value() plot_js = self.checkBox_deseq2_js_volcano.isChecked() @@ -5085,7 +5091,7 @@ def deseq2_plot_sankey(self): return None try: df = self.table_dict[table_name] - title_name = f'{group1} vs {group2} of {table_name.split("(")[1].split(")")[0]}' + title_name = f'{group2} vs {group1} of {table_name.split("(")[1].split(")")[0]}' pic = SankeyPlot(self.tfa, theme=self.html_theme).plot_fc_sankey(df, width=width, height=height, pvalue=pvalue, p_type = p_type, log2fc_min=log2fc_min, log2fc_max=log2fc_max, title =title_name, font_size=font_size) diff --git a/metax/gui/metax_gui/main_window.ui b/metax/gui/metax_gui/main_window.ui index 2844d1d..b586e1c 100644 --- a/metax/gui/metax_gui/main_window.ui +++ b/metax/gui/metax_gui/main_window.ui @@ -46,7 +46,7 @@ Qt::LeftToRight - 4 + 3 false @@ -245,8 +245,8 @@ 0 0 - 528 - 573 + 391 + 80 @@ -1400,7 +1400,7 @@ 0 0 - 1016 + 660 232 @@ -2672,7 +2672,7 @@ 0 0 1016 - 162 + 158 @@ -2808,7 +2808,7 @@ - column + col @@ -3674,7 +3674,7 @@ 0 0 - 1003 + 1020 126 @@ -5543,7 +5543,7 @@ 0 0 996 - 146 + 140 @@ -6093,8 +6093,8 @@ 0 0 - 1016 - 181 + 493 + 128 @@ -7323,8 +7323,8 @@ 0 0 - 1016 - 144 + 538 + 63 @@ -8171,8 +8171,8 @@ 0 0 - 1016 - 185 + 775 + 102 @@ -9091,8 +9091,8 @@ 0 0 - 1016 - 168 + 383 
+ 68 @@ -10086,7 +10086,7 @@ 0 0 1122 - 21 + 23 diff --git a/metax/gui/metax_gui/ui_main_window.py b/metax/gui/metax_gui/ui_main_window.py index bf9e99b..e4c1333 100644 --- a/metax/gui/metax_gui/ui_main_window.py +++ b/metax/gui/metax_gui/ui_main_window.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Form implementation generated from reading ui file 'c:\Users\Qing\OneDrive - University of Ottawa\code\TaxaFunc\MetaX\metax\gui\metax_gui\main_window.ui' +# Form implementation generated from reading ui file 'c:\Users\max\OneDrive - University of Ottawa\code\TaxaFunc\MetaX\metax\gui\metax_gui\main_window.ui' # # Created by: PyQt5 UI code generator 5.15.9 # @@ -147,7 +147,7 @@ def setupUi(self, metaX_main): self.toolBox_2.setMaximumSize(QtCore.QSize(1677, 16777215)) self.toolBox_2.setObjectName("toolBox_2") self.page_2 = QtWidgets.QWidget() - self.page_2.setGeometry(QtCore.QRect(0, 0, 528, 573)) + self.page_2.setGeometry(QtCore.QRect(0, 0, 391, 80)) self.page_2.setObjectName("page_2") self.gridLayout_27 = QtWidgets.QGridLayout(self.page_2) self.gridLayout_27.setObjectName("gridLayout_27") @@ -706,7 +706,7 @@ def setupUi(self, metaX_main): self.scrollArea.setWidgetResizable(True) self.scrollArea.setObjectName("scrollArea") self.scrollAreaWidgetContents = QtWidgets.QWidget() - self.scrollAreaWidgetContents.setGeometry(QtCore.QRect(0, 0, 1016, 232)) + self.scrollAreaWidgetContents.setGeometry(QtCore.QRect(0, 0, 660, 232)) self.scrollAreaWidgetContents.setObjectName("scrollAreaWidgetContents") self.gridLayout_34 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents) self.gridLayout_34.setObjectName("gridLayout_34") @@ -1370,7 +1370,7 @@ def setupUi(self, metaX_main): self.scrollArea_2.setWidgetResizable(True) self.scrollArea_2.setObjectName("scrollArea_2") self.scrollAreaWidgetContents_2 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 1016, 162)) + self.scrollAreaWidgetContents_2.setGeometry(QtCore.QRect(0, 0, 1016, 158)) 
self.scrollAreaWidgetContents_2.setObjectName("scrollAreaWidgetContents_2") self.gridLayout_50 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_2) self.gridLayout_50.setObjectName("gridLayout_50") @@ -1919,7 +1919,7 @@ def setupUi(self, metaX_main): self.scrollArea_cross_heatmap_settings.setWidgetResizable(True) self.scrollArea_cross_heatmap_settings.setObjectName("scrollArea_cross_heatmap_settings") self.scrollAreaWidgetContents_3 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 1003, 126)) + self.scrollAreaWidgetContents_3.setGeometry(QtCore.QRect(0, 0, 1020, 126)) self.scrollAreaWidgetContents_3.setObjectName("scrollAreaWidgetContents_3") self.gridLayout_38 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_3) self.gridLayout_38.setObjectName("gridLayout_38") @@ -2844,7 +2844,7 @@ def setupUi(self, metaX_main): self.scrollArea_3.setWidgetResizable(True) self.scrollArea_3.setObjectName("scrollArea_3") self.scrollAreaWidgetContents_4 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 996, 146)) + self.scrollAreaWidgetContents_4.setGeometry(QtCore.QRect(0, 0, 996, 140)) self.scrollAreaWidgetContents_4.setObjectName("scrollAreaWidgetContents_4") self.gridLayout_68 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_4) self.gridLayout_68.setObjectName("gridLayout_68") @@ -3145,7 +3145,7 @@ def setupUi(self, metaX_main): self.scrollArea_4.setWidgetResizable(True) self.scrollArea_4.setObjectName("scrollArea_4") self.scrollAreaWidgetContents_5 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_5.setGeometry(QtCore.QRect(0, 0, 1016, 181)) + self.scrollAreaWidgetContents_5.setGeometry(QtCore.QRect(0, 0, 493, 128)) self.scrollAreaWidgetContents_5.setObjectName("scrollAreaWidgetContents_5") self.gridLayout_49 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_5) self.gridLayout_49.setObjectName("gridLayout_49") @@ -3790,7 +3790,7 @@ def setupUi(self, metaX_main): 
self.scrollArea_5.setWidgetResizable(True) self.scrollArea_5.setObjectName("scrollArea_5") self.scrollAreaWidgetContents_6 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 1016, 144)) + self.scrollAreaWidgetContents_6.setGeometry(QtCore.QRect(0, 0, 538, 63)) self.scrollAreaWidgetContents_6.setObjectName("scrollAreaWidgetContents_6") self.gridLayout_57 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_6) self.gridLayout_57.setObjectName("gridLayout_57") @@ -4261,7 +4261,7 @@ def setupUi(self, metaX_main): self.scrollArea_6.setWidgetResizable(True) self.scrollArea_6.setObjectName("scrollArea_6") self.scrollAreaWidgetContents_7 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_7.setGeometry(QtCore.QRect(0, 0, 1016, 185)) + self.scrollAreaWidgetContents_7.setGeometry(QtCore.QRect(0, 0, 775, 102)) self.scrollAreaWidgetContents_7.setObjectName("scrollAreaWidgetContents_7") self.gridLayout_69 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_7) self.gridLayout_69.setObjectName("gridLayout_69") @@ -4765,7 +4765,7 @@ def setupUi(self, metaX_main): self.scrollArea_7.setWidgetResizable(True) self.scrollArea_7.setObjectName("scrollArea_7") self.scrollAreaWidgetContents_8 = QtWidgets.QWidget() - self.scrollAreaWidgetContents_8.setGeometry(QtCore.QRect(0, 0, 1016, 168)) + self.scrollAreaWidgetContents_8.setGeometry(QtCore.QRect(0, 0, 383, 68)) self.scrollAreaWidgetContents_8.setObjectName("scrollAreaWidgetContents_8") self.gridLayout_66 = QtWidgets.QGridLayout(self.scrollAreaWidgetContents_8) self.gridLayout_66.setObjectName("gridLayout_66") @@ -5276,7 +5276,7 @@ def setupUi(self, metaX_main): self.statusbar.setObjectName("statusbar") metaX_main.setStatusBar(self.statusbar) self.menuBar = QtWidgets.QMenuBar(metaX_main) - self.menuBar.setGeometry(QtCore.QRect(0, 0, 1122, 21)) + self.menuBar.setGeometry(QtCore.QRect(0, 0, 1122, 23)) self.menuBar.setObjectName("menuBar") self.menuTools = QtWidgets.QMenu(self.menuBar) 
self.menuTools.setObjectName("menuTools") @@ -5336,7 +5336,7 @@ def setupUi(self, metaX_main): self.retranslateUi(metaX_main) self.stackedWidget.setCurrentIndex(0) - self.tabWidget_TaxaFuncAnalyzer.setCurrentIndex(4) + self.tabWidget_TaxaFuncAnalyzer.setCurrentIndex(3) self.toolBox_2.setCurrentIndex(0) self.tabWidget_4.setCurrentIndex(1) self.tabWidget_3.setCurrentIndex(3) @@ -5616,7 +5616,7 @@ def retranslateUi(self, metaX_main): self.label_186.setText(_translate("metaX_main", "Sankey")) self.label_31.setText(_translate("metaX_main", "Scale")) self.comboBox_basic_hetatmap_scale.setItemText(0, _translate("metaX_main", "row")) - self.comboBox_basic_hetatmap_scale.setItemText(1, _translate("metaX_main", "column")) + self.comboBox_basic_hetatmap_scale.setItemText(1, _translate("metaX_main", "col")) self.comboBox_basic_hetatmap_scale.setItemText(2, _translate("metaX_main", "all")) self.comboBox_basic_hetatmap_scale.setItemText(3, _translate("metaX_main", "None")) self.label_13.setText(_translate("metaX_main", "Theme")) diff --git a/metax/peptide_annotator/convert_id_to_name.py b/metax/peptide_annotator/convert_id_to_name.py index 783d720..45a5a28 100644 --- a/metax/peptide_annotator/convert_id_to_name.py +++ b/metax/peptide_annotator/convert_id_to_name.py @@ -177,15 +177,18 @@ def lookup_and_join(ec_nums, column_name): print("Add EC columns to df successfully!") return df -def add_pathway_name_to_df(df: pd.DataFrame) -> pd.DataFrame: - def query_kegg(id_str, pathway_dict): +def add_pathway_name_to_df(df: pd.DataFrame, kppe_id:bool = False) -> pd.DataFrame: + def query_kegg(id_str, pathway_dict, kppe_id=False): id_list = id_str.split(',') if id_list[0] == 'not_found': return 'not_found' pathway_list = [] for id in id_list: if id in pathway_dict: - pathway_list.append(pathway_dict[id]) + if kppe_id: + pathway_list.append(f'{id}:{pathway_dict[id]}') + else: + pathway_list.append(pathway_dict[id]) # remove duplicates pathway_list = list(dict.fromkeys(pathway_list)) if 
len(pathway_list) == 0: @@ -199,9 +202,12 @@ def query_kegg(id_str, pathway_dict): if 'KEGG_Pathway' not in df.columns: print('KEGG_Pathway column does not exist!, return the original dataframe') return df + + #! fill the missing pathway names if necessary + # df['KEGG_Pathway'] = df['KEGG_Pathway'].fillna('not_found') pathway_dict = get_pathway_dict() - df.loc[:, 'KEGG_Pathway_name'] = df['KEGG_Pathway'].apply(lambda x: query_kegg(x, pathway_dict)) + df.loc[:, 'KEGG_Pathway_name'] = df['KEGG_Pathway'].apply(lambda x: query_kegg(x, pathway_dict, kppe_id)) df.loc[:, 'KEGG_Pathway_name_prop'] = df['KEGG_Pathway_prop'] print("Add KEGG_Pathway_name to df successfully!") return df @@ -250,7 +256,7 @@ def query_ko(id_str, ko_dict): # if __name__ == '__main__': # df_path = "MetaX/data/example_data/Example_OTF.tsv" # df = pd.read_csv(df_path, sep='\t') -# df = add_pathway_name_to_df(df) +# df = add_pathway_name_to_df(df, kppe_id=True) # df = add_ec_name_to_df(df) # df = add_ko_name_to_df(df) # df.to_csv("11.tsv", sep='\t', index=False) \ No newline at end of file diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py index 2c79fb4..4d4f616 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -791,6 +791,7 @@ def get_df(self, table_name:str = 'taxa'): "proteins": "protein_df", } + table_name = table_name.lower() dft = getattr(self, name_dict[table_name]) # remove peptide_num column if exists if "peptide_num" in dft.columns: diff --git a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py index 5a02486..8995e84 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py +++ b/metax/taxafunc_analyzer/analyzer_utils/sum_protein_intensity.py @@ -28,7 +28,10 @@ def __init__(self, taxa_func_analyzer): self.df = self.tfa.original_df.loc[:, self.extract_col_name] self._init_dicts() self.greedy_method = None # only 
used for razor method - + self.share_intensity = False + self.__multi_target_count = 0 + + def check_protein_col(self): # if any NA, '', or empty in the protein column, raise error if self.df[self.tfa.protein_col_name].isnull().values.any(): @@ -71,6 +74,8 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un # use Set Cover Problem to get the protein list, then sum the intensity pep_to_protein = self._create_pep_to_protein_razor() self._sum_protein_razor(pep_to_protein) + self.__multi_target_count = self.__multi_target_count/len(self.tfa.sample_list) + print(f'Peptides with multiple targets: {self.__multi_target_count} ({self.__multi_target_count/len(pep_to_protein)*100:.2f}%)') elif method == 'anti-razor': print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------") @@ -269,13 +274,19 @@ def _update_output_dict(self, protein_list: list, sample_name:str, intensity:flo else: self.res_intensity_dict[sample_name][protein] = intensity else: - intensity = intensity/len(protein_list) - for protein in protein_list: - if protein in self.res_intensity_dict[sample_name].keys(): + if self.share_intensity: + intensity = intensity/len(protein_list) + for protein in protein_list: + self.res_intensity_dict.setdefault(sample_name, {}).setdefault(protein, 0) self.res_intensity_dict[sample_name][protein] += intensity - else: - self.res_intensity_dict[sample_name][protein] = intensity - + else: + self.__multi_target_count += 1 + protein = protein_list[0] + self.res_intensity_dict.setdefault(sample_name, {}).setdefault(protein, 0) + self.res_intensity_dict[sample_name][protein] += intensity + + + def _sum_protein_rank(self, sample_name:str, by_sample=False): # print in one line diff --git a/metax/taxafunc_ploter/diversity_plot.py b/metax/taxafunc_ploter/diversity_plot.py index fb89d86..966a6cc 100644 --- a/metax/taxafunc_ploter/diversity_plot.py +++ 
b/metax/taxafunc_ploter/diversity_plot.py @@ -27,10 +27,12 @@ def ace_with_threshold(self, row): def plot_alpha_diversity(self, metric:str='shannon', sample_list:list=None, width:int = 10, height:int = 8, font_size:int = 10, plot_all_samples:bool = False, theme:str = None, sub_meta:str = 'None', - show_fliers = True, legend_col_num: int | None = None, rename_sample:bool = False + show_fliers = True, legend_col_num: int | None = None, rename_sample:bool = False, + df_type:str = 'taxa', title_name:str = "Table" ): ''' Calculate alpha diversity and plot boxplot\n + df_type: ['taxa', 'functions', 'taxa_functions', return: (fig, aplha_diversity_df) ''' if sample_list is None: @@ -63,7 +65,8 @@ def plot_alpha_diversity(self, metric:str='shannon', sample_list:list=None, raise ValueError(f'Invalid metric: {metric}. Please choose from: {list(metric_dict.keys())}') try: - df = self.tfa.taxa_df.copy() + # df = self.tfa.taxa_df.copy() + df = self.tfa.get_df(df_type) df = df[sample_list] if metric == 'ace': @@ -135,7 +138,8 @@ def plot_alpha_diversity(self, metric:str='shannon', sample_list:list=None, fig.set_yticklabels(fig.get_yticks(), fontsize=font_size) fig.set_xlabel('Group', fontsize=font_size) fig.set_ylabel(f'{metric} Index', fontsize=font_size) - fig.set_title(f'Alpha Diversity ({metric})', fontsize=font_size+2, fontweight='bold') + fig.set_title(f'Alpha Diversity of {title_name} ({metric})', + fontsize=font_size+2, fontweight='bold') if sub_meta: if legend_col_num != 0: plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0., @@ -171,7 +175,8 @@ def plot_beta_diversity(self, metric:str='braycurtis', sample_list:list|None=Non width:int = 10, height:int = 8, font_size:int = 10, font_transparency:float = 0.8, show_label:bool = False,rename_sample:bool = False, adjust_label:bool = False , theme:str|None = None, sub_meta:str = "None", - legend_col_num: int | None = None, dot_size: float|None = None): + legend_col_num: int | None = None, dot_size: 
float|None = None, df_type:str = 'taxa', + title_name:str = "Table"): ''' Calculate beta diversity and plot PCoA plot Return:(fig, distance_matrix) @@ -200,7 +205,7 @@ def plot_beta_diversity(self, metric:str='braycurtis', sample_list:list|None=Non color_palette = None # Let seaborn handle the color mapping try: - df = self.tfa.taxa_df.copy() + df = self.tfa.get_df(df_type) df = df[sample_list] df = df.T @@ -225,8 +230,10 @@ def plot_beta_diversity(self, metric:str='braycurtis', sample_list:list|None=Non fig.set_ylabel("PC2 (%.2f%%)" % (pcoa_res.proportion_explained[1] * 100), fontsize=font_size) # set title num_legend = len(unique_groups) if sub_meta == 'None' else len(set(style_list)) + len(unique_groups) - - plt.title(f'PCoA plot of {metric} distance (Total explained variation: {pcoa_res.proportion_explained[0] * 100 + pcoa_res.proportion_explained[1] * 100:.2f}%)', fontsize=font_size+2, fontweight='bold') + + title = f'PCoA plot of {metric} distance {title_name} (Total explained variation: {pcoa_res.proportion_explained[0] * 100 + pcoa_res.proportion_explained[1] * 100:.2f}%)' + plt.title(title, fontsize=font_size+2, fontweight='bold') + if legend_col_num != 0: plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0., fontsize=font_size +2 , ncol= (num_legend//30 + 1) if legend_col_num is None else legend_col_num) diff --git a/metax/taxafunc_ploter/heatmap_plot.py b/metax/taxafunc_ploter/heatmap_plot.py index c961ff1..4c422d4 100644 --- a/metax/taxafunc_ploter/heatmap_plot.py +++ b/metax/taxafunc_ploter/heatmap_plot.py @@ -378,7 +378,7 @@ def plot_basic_heatmap(self, df, title = 'Heatmap',fig_size:tuple|None = None, fig.ax_heatmap.set_xticklabels(fig.ax_heatmap.get_xmajorticklabels(), fontsize=font_size, rotation=90) fig.ax_heatmap.set_yticklabels(fig.ax_heatmap.get_ymajorticklabels(), fontsize=font_size, rotation=0) - title = f"{title} (scaled by {scale})" + title = f"{title} (scaled by {scale})" if scale not in [None, 'None'] else title 
plt.suptitle(title, weight='bold') cbar = fig.ax_heatmap.collections[0].colorbar @@ -754,6 +754,11 @@ def scale_data(self, df: pd.DataFrame, scale_by: str|None = None, method: str|No return df df = df.copy() + + # convert 'column' to 'col' for consistency + if scale_by == 'column': + scale_by = 'col' + if scale_by not in ['row', 'col', 'all', 'none']: raise ValueError("scale_by must be 'row', 'col', 'all' or 'none'") diff --git a/metax/taxafunc_ploter/volcano_plot.py b/metax/taxafunc_ploter/volcano_plot.py index 8b10985..1571b96 100644 --- a/metax/taxafunc_ploter/volcano_plot.py +++ b/metax/taxafunc_ploter/volcano_plot.py @@ -10,7 +10,7 @@ def __init__(self): def plot_volcano(self, df_fc, pvalue: float = 0.05, p_type='padj', log2fc_min: float = 1, log2fc_max: float = 10, - title_name='2 groups',font_size:int=12, width=8, height=6, dot_size=15, theme:str|None = None): + title_name='2 groups',font_size:int=12, width=8, height=6, dot_size=15, theme:str|None = None, alpha=0.8): def color_mapping(type_value): if type_value == 'up': @@ -42,14 +42,22 @@ def color_mapping(type_value): # create the volcano plot plt.figure(figsize=(width, height)) - fig = sns.scatterplot(x=df['log2FoldChange'], y=-np.log10(df[p_type]), s=dot_size*10, hue=df['type'], alpha=0.8, - palette={'up': '#d23918', 'down': '#68945c', 'ultra-up': '#663d74', 'ultra-down': '#206864', 'normal': '#6b798e'}, linewidth=0.5, edgecolor='black') + fig = sns.scatterplot(x=df['log2FoldChange'], y=-np.log10(df[p_type]), s=dot_size*10, hue=df['type'], alpha=alpha, + # palette={'up': '#d23918', 'down': '#68945c', 'ultra-up': '#663d74', 'ultra-down': '#206864', 'normal': '#6b798e'}, + palette={'up': color_mapping('up'), 'down': color_mapping('down'), 'ultra-up': color_mapping('ultra-up'), 'ultra-down': color_mapping('ultra-down'), 'normal': color_mapping('normal')}, + linewidth=0.5, edgecolor='black') plt.axhline(y=-np.log10(pvalue), linestyle='--', color='grey', linewidth=1) # padj line plt.axvline(x=-log2fc_min, 
linestyle='--', color='grey', linewidth=1) # log2FoldChange line plt.axvline(x=log2fc_min, linestyle='--', color='grey', linewidth=1) # log2FoldChange line # set the title and labels - fig.set_title(f'Volcano plot of {title_name} ({"padj" if p_type == "padj" else "pvalue"} < {pvalue}, |log2FoldChange| > {log2fc_min})', fontsize=font_size) + # if ultra-up or ultra-down is not in the data, then don't show it in the title + if len(df[df['type'].isin(['ultra-up', 'ultra-down'])]) == 0: + log2fc_title = f'|log2FoldChange| >= {log2fc_min}' + else: + log2fc_title = f'{log2fc_min} <= |log2FoldChange| < {log2fc_max}' + + fig.set_title(f'Volcano plot of {title_name} ({"padj" if p_type == "padj" else "pvalue"} <= {pvalue}, {log2fc_title})', fontsize=font_size) fig.set_xlabel('log2FoldChange', fontsize=font_size) fig.set_ylabel('-log10(padj)', fontsize=font_size) sns.despine(trim=True) @@ -63,7 +71,7 @@ def color_mapping(type_value): if count_dict[t] == 0: continue # set the size of dot as font size*10, because when the font size is small, the dot will be overlapped - h = plt.scatter([], [], s=font_size*10, color=color_mapping(t), alpha=0.8, linewidth=0.5, edgecolor='black') + h = plt.scatter([], [], s=font_size*10, color=color_mapping(t), alpha=alpha, linewidth=0.5, edgecolor='black') handles.append(h) labels.append(f'{t} ({count_dict[t]})') fig.legend(handles=handles, labels=labels, diff --git a/metax/taxafunc_ploter/volcano_plot_js.py b/metax/taxafunc_ploter/volcano_plot_js.py index 413e43b..eb00289 100644 --- a/metax/taxafunc_ploter/volcano_plot_js.py +++ b/metax/taxafunc_ploter/volcano_plot_js.py @@ -63,7 +63,13 @@ def color_mapping(type_value): scatter_ultra_down = df[df['type'] == 'ultra-down'].apply(lambda p: {'name': p['label'], 'value': [p['log2FoldChange'], p[p_type]]}, axis=1) Scatter_normal = df[df['type'] == 'normal'].apply(lambda p: {'name': p['label'], 'value': [p['log2FoldChange'], p[p_type]]}, axis=1) - title = f'Volcano plot of {title_name} ({p_type} <= 
{pvalue}, {log2fc_min} <= log2FoldChange < {log2fc_max})' + # if ultra-up or ultra-down is not in the data, then don't show it in the title + if len(df[df['type'].isin(['ultra-up', 'ultra-down'])]) == 0: + log2fc_title = f'|log2FoldChange| >= {log2fc_min}' + else: + log2fc_title = f'{log2fc_min} <= |log2FoldChange| < {log2fc_max}' + + title = f'Volcano plot of {title_name} ({p_type} <= {pvalue}, {log2fc_title})' scatter = ( Scatter(init_opts=opts.InitOpts(width=f"{width*100}px", height=f"{height*100}px", theme=self.theme)) diff --git a/metax/utils/scripts/razor_sum.py b/metax/utils/scripts/razor_sum.py new file mode 100644 index 0000000..fef9b3a --- /dev/null +++ b/metax/utils/scripts/razor_sum.py @@ -0,0 +1,214 @@ +from collections import defaultdict +import pandas as pd +from tqdm import tqdm + + +class RazorSum: + def __init__(self, df, column_map): + self.df = df + self.column_map = column_map + self.res_intensity_dict = {} # store all sample to output + self.greedy_method = None # only used for razor method + self.mini_target_set = None + self.filtered_target_to_peptides = None + self.share_intensity = False + self.__multi_target_count = 0 + + + def sum_protein_intensity(self, greedy_method='heap'): + self.greedy_method = greedy_method + print('Start to sum protein intensity using method: [razor]') + if column_map['sample_list'] is None or len(column_map['sample_list']) == 0: + raise ValueError('Please provide [sample_list] in column_map for sum, e.g. 
["Sample1", "Sample2", "Sample3"]') + # only extract the peptide and target columns + extract_cols = [self.column_map['peptide'], self.column_map['target']] + self.column_map['sample_list'] + self.df = self.df.loc[:, extract_cols] + + pep_to_target = self._create_pep_to_target_razor() + self._sum_target_intensity(pep_to_target) + + # show summary + print(f"Total peptides count: {len(pep_to_target)}") + self.__multi_target_count = self.__multi_target_count/len(sample_list) + print(f"Multi-target peptides count: {self.__multi_target_count} ({self.__multi_target_count / len(pep_to_target) * 100:.2f}%)") + + + res_df = pd.DataFrame.from_dict(self.res_intensity_dict) + res_df.fillna(0, inplace=True) + res_df.index.name = 'Target' + + print('Finish summing protein intensity') + + return res_df + + def get_mini_target_set(self, greedy_method='heap'): + self.greedy_method = greedy_method + print('Start to get minimum target set using method: [razor]') + # only extract the peptide and target columns + extract_cols = [self.column_map['peptide'], self.column_map['target']] + self.column_map['sample_list'] if self.column_map['sample_list'] else [] + # if NA in target column, or '', raise error + if self.df[self.column_map['target']].isna().any() or '' in self.df[self.column_map['target']].values: + raise ValueError(f'NA or empty value in target column: {self.column_map["target"]}') + + self.df = self.df.loc[:, extract_cols] + df = self.df.loc[:, [self.column_map['peptide'], self.column_map['target']]] + peptides = set(df[self.column_map['peptide']]) + target_to_peptides = self._create_target_to_peptides() + mini_target_set = self.find_minimum_target_set(peptides, target_to_peptides) + filtered_target_to_peptides = {target: target_to_peptides[target] for target in mini_target_set} + self.mini_target_set = mini_target_set + self.filtered_target_to_peptides = filtered_target_to_peptides + return self.mini_target_set + + def _create_pep_to_target_razor(self): + """ + Create a 
dictionary mapping peptides to targets based on a minimum target set. + + Returns: + dict: A dictionary mapping peptides to targets. + key: peptide + value: a list of targets + """ + self.get_mini_target_set(self.greedy_method) + + peptides = set(self.df[self.column_map['peptide']]) + filtered_target_to_peptides = self.filtered_target_to_peptides + + peptide_to_target = defaultdict(list) + for peptide in tqdm(peptides, desc="Assigning peptides to targets"): + possible_targets = [target for target, peps in filtered_target_to_peptides.items() if peptide in peps] + if possible_targets: + max_target_count = max(len(filtered_target_to_peptides[target]) for target in possible_targets) + best_targets = [target for target in possible_targets if len(filtered_target_to_peptides[target]) == max_target_count] + peptide_to_target[peptide].extend(best_targets) + + return peptide_to_target + + def _create_target_to_peptides(self): + """ + Create a dictionary mapping targets to peptides. + e.g. {'target1': {'peptide1', 'peptide2'}, 'target2': {'peptide1', 'peptide3'}} + + """ + df = self.df.loc[:, [self.column_map['peptide'], self.column_map['target']]] + target_to_peptides = defaultdict(set) + + for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating target to peptides mapping"): + sequence = row[self.column_map['peptide']] + targets = row[self.column_map['target']].split(';') + for target in targets: + target_to_peptides[target].add(sequence) + + return target_to_peptides + + def _sum_target_intensity(self, peptide_to_target): + for sample in tqdm(self.column_map['sample_list'], desc="Summing intensity"): + df_sample = self.df.loc[:, [self.column_map['peptide'], sample]] + df_sample.set_index(self.column_map['peptide'], inplace=True) + peptide_intensity_dict = df_sample.to_dict()[sample] + for peptide, targets in peptide_to_target.items(): + intensity = peptide_intensity_dict.get(peptide, 0) + self._update_output_dict(targets, sample, intensity) + + + def 
find_minimum_target_set(self, peptides, target_to_peptides): + target_to_peptides_copy = target_to_peptides.copy() + # print current target number + print(f'Current target number: {len(target_to_peptides_copy)}') + peptides_to_cover = set(peptides) + selected_targets = set() + method = self.greedy_method + + if method == 'greedy': + print('Start creating protein dict for "Set Cover Problem" with Greedy Approximation Algorithm') + with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar: + while peptides_to_cover: + best_protein = None + peptides_covered_by_best = set() + for protein, covered_peptides in target_to_peptides_copy.items(): + covered = peptides_to_cover & covered_peptides + if len(covered) > len(peptides_covered_by_best): + best_protein = protein + peptides_covered_by_best = covered + + if not best_protein: + break + + selected_targets.add(best_protein) + peptides_to_cover -= peptides_covered_by_best + target_to_peptides_copy.pop(best_protein) # remove the protein from the dict to speed up the process + pbar.update(len(peptides_covered_by_best)) + elif method == 'heap': + import heapq + target_coverage = {target: covered_peptides & peptides_to_cover + for target, covered_peptides in target_to_peptides_copy.items()} + target_heap = [(-len(covered), target) for target, covered in target_coverage.items()] + heapq.heapify(target_heap) + + with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar: + while peptides_to_cover: + while target_heap: + max_covered, best_target = heapq.heappop(target_heap) + if best_target in target_coverage: + peptides_covered_by_best = target_coverage.pop(best_target) + break + + if not best_target or not peptides_covered_by_best: + break + + selected_targets.add(best_target) + peptides_to_cover -= peptides_covered_by_best + pbar.update(len(peptides_covered_by_best)) + + for target in list(target_coverage.keys()): + if target_coverage[target] & peptides_covered_by_best: + target_coverage[target] 
-= peptides_covered_by_best + heapq.heappush(target_heap, (-len(target_coverage[target]), target)) + if not target_coverage[target]: + del target_coverage[target] + else: + raise ValueError(f"Invalid greedy method: {method}. Must be ['greedy' or 'heap']") + + + print(f'Minium target number: {len(selected_targets)}') + return selected_targets + + def _update_output_dict(self, target_list, sample_name, intensity): + if len(target_list) == 1: + target = target_list[0] + self.res_intensity_dict.setdefault(sample_name, {}).setdefault(target, 0) + self.res_intensity_dict[sample_name][target] += intensity + else: + if self.share_intensity: + intensity /= len(target_list) + for target in target_list: + self.res_intensity_dict.setdefault(sample_name, {}).setdefault(target, 0) + self.res_intensity_dict[sample_name][target] += intensity + + else: # assign the intensity to the 1st target + self.__multi_target_count += 1 + target = target_list[0] + self.res_intensity_dict.setdefault(sample_name, {}).setdefault(target, 0) + self.res_intensity_dict[sample_name][target] += intensity + +# Example usage: +# Assuming df is your pandas dataframe and column_map is your dictionary +if __name__ == '__main__': + df = pd.read_csv('OTF.tsv', sep='\t') + df_meta = pd.read_csv('meta.txt', sep='\t') + sample_list = df_meta['Samples'].unique().tolist() + sample_list = ["Intensity_" + sample for sample in sample_list] + + column_map = { + 'peptide': 'Sequence', + 'target': 'Proteins', + 'sample_list': sample_list # ['Sample1', 'Sample2', 'Sample3'] + } + sia = RazorSum(df, column_map) + + res_df = sia.sum_protein_intensity(greedy_method='heap') + res_df.to_csv('razor_protein_intensity.tsv', sep='\t') + + # or get minimum target set only + # mini_target_set = sia.get_mini_target_set(greedy_method='heap') + diff --git a/metax/utils/version.py b/metax/utils/version.py index 3896684..f7a9d51 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.109.12' 
+__version__ = '1.110.0' API_version = '2' \ No newline at end of file