From 5dfc08bdfa7d4dab11ce04f8029657c918249e64 Mon Sep 17 00:00:00 2001 From: Qing Date: Wed, 21 Aug 2024 23:44:51 -0400 Subject: [PATCH] - Fix: Fixed the bug of extracting the peptides of taxa, funcs or taxa-funcs when splitting the function items. --- Docs/ChangeLog.md | 7 +- metax/gui/main_gui.py | 27 +++---- metax/taxafunc_analyzer/analyzer.py | 75 ++++++++++++++++++- .../analyzer_utils/get_matrix.py | 15 ++-- metax/utils/version.py | 2 +- pyproject.toml | 2 +- 6 files changed, 98 insertions(+), 30 deletions(-) diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index bfe5f69..a519131 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -1,9 +1,14 @@ +# Version: 1.111.6 +## Date: 2024-08-21 +### Changes: +- Fix: Fixed the bug of extracting the peptides of taxa, funcs or taxa-funcs when splitting the function items. + + # Version: 1.111.5 ## Date: 2024-08-21 ### Changes: - Change: Optimized the x-axis and y-axis labels of the heatmap plot to make the labels more clear. - # Version: 1.111.4 ## Date: 2024-08-21 ### Changes: diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py index 18a2d98..5bfde74 100644 --- a/metax/gui/main_gui.py +++ b/metax/gui/main_gui.py @@ -3479,28 +3479,19 @@ def plot_basic_list(self, plot_type='heatmap'): df = self.tfa.peptide_df.copy() else: + peptides_list = [] + if table_name == 'Taxa': - df = self.tfa.clean_df.loc[self.tfa.clean_df['Taxon'].isin(self.basic_heatmap_list)] - df.index = df[self.tfa.peptide_col_name] + for i in self.basic_heatmap_list: + peptides_list.extend(self.tfa.peptides_linked_dict['taxa'][i]) elif table_name == 'Functions': - df = self.tfa.clean_df.loc[self.tfa.clean_df[self.tfa.func_name].isin(self.basic_heatmap_list)] - df.index = df[self.tfa.peptide_col_name] + for i in self.basic_heatmap_list: + peptides_list.extend(self.tfa.peptides_linked_dict['func'][i]) elif table_name == 'Taxa-Functions': - df_list = [] for i in self.basic_heatmap_list: - taxon, func = i.split(' <') - func = func[:-1] - dft = 
self.tfa.clean_df.loc[(self.tfa.clean_df['Taxon'] == taxon) & (self.tfa.clean_df[self.tfa.func_name] == func)] - df_list.append(dft) - - if df_list: - df_all = pd.concat(df_list) - df_all.index = df_all[self.tfa.peptide_col_name] - df = df_all - else: - raise ValueError('No valid taxa-function belongs to the selected taxa-function!') + peptides_list.extend(self.tfa.peptides_linked_dict['taxa_func'][i]) elif table_name == 'Proteins': QMessageBox.warning(self.MainWindow, 'Warning', @@ -3511,9 +3502,9 @@ def plot_basic_list(self, plot_type='heatmap'): return else: # Peptide - df = self.tfa.peptide_df.copy() - df = df.loc[self.basic_heatmap_list] + peptides_list = self.basic_heatmap_list + df = self.tfa.peptide_df.loc[peptides_list] df = df[sample_list] else: diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py index 4c8ddbd..2707a05 100644 --- a/metax/taxafunc_analyzer/analyzer.py +++ b/metax/taxafunc_analyzer/analyzer.py @@ -70,11 +70,15 @@ def __init__( self.func_taxa_df: Optional[pd.DataFrame] = None self.taxa_func_linked_dict: Optional[Dict[str, List[tuple]]] = None self.func_taxa_linked_dict: Optional[Dict[str, List[tuple]]] = None + self.peptides_linked_dict = {'taxa': {}, 'func': {}, 'taxa_func': {}} self.protein_df: Optional[pd.DataFrame] = None self.any_df_mode = any_df_mode # if True, the consider the TaxaFunc df as other_df self.custom_df: Optional[pd.DataFrame] = None # other df, any df that user want to add self.outlier_status = {'peptide': None, 'taxa': None, 'func': None, 'taxa_func': None, 'protein': None, 'custom': None} + + self.split_func_status:bool = False + self.split_func_sep:str = '' # load function self.BasicStats = BasicStats(self) @@ -560,6 +564,7 @@ def split_func(self, taxa_func_df, split_func_params: dict = {'split_by': ',', ' num_splits = len(split_funcs_list) for new_func in split_funcs_list: + new_func = new_func.strip() split_row = row[sample_list] / num_splits if share_intensity else row[sample_list] 
split_row[func_col] = new_func split_row[taxon_col] = row[taxon_col] @@ -577,6 +582,65 @@ def split_func(self, taxa_func_df, split_func_params: dict = {'split_by': ',', ' return new_data + def create_peptides_dict_in_taxa_func(self, dfc): + """ + Creates a dictionary of peptides in taxa, func, and taxa_func. + Parameters: + dfc (DataFrame): The input DataFrame containing the peptide, taxon, and function columns. + Returns: + self.peptides_linked_dict (dict): A dictionary containing the peptides in taxa, func, and taxa_func. + """ + print("Creating peptides_linked_dict in taxa, func, and taxa_func...") + df = dfc.copy()[[self.peptide_col_name, 'Taxon', self.func_name]] + peptide_col = self.peptide_col_name + taxa_col = 'Taxon' + func_col = self.func_name + + peptides_in_taxa_func = {} + peptides_in_taxa = {} + peptides_in_func = {} + + if self.split_func_status: + for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating peptides_dict"): + peptide = row[peptide_col] + taxa = row[taxa_col] + func_list = [f.strip() for f in row[func_col].split(self.split_func_sep)] + + if taxa not in peptides_in_taxa: + peptides_in_taxa[taxa] = [] + peptides_in_taxa[taxa].append(peptide) + + for f in func_list: + if f not in peptides_in_func: + peptides_in_func[f] = [] + peptides_in_func[f].append(peptide) + taxa_func = f'{taxa} <{f}>' + if taxa_func not in peptides_in_taxa_func: + peptides_in_taxa_func[taxa_func] = [] + peptides_in_taxa_func[taxa_func].append(peptide) + else: + for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating peptides_dict"): + peptide = row[peptide_col] + taxa = row[taxa_col] + func = row[func_col] + + if taxa not in peptides_in_taxa: + peptides_in_taxa[taxa] = [] + peptides_in_taxa[taxa].append(peptide) + + if func not in peptides_in_func: + peptides_in_func[func] = [] + peptides_in_func[func].append(peptide) + + taxa_func = f'{taxa} <{func}>' + if taxa_func not in peptides_in_taxa_func: + peptides_in_taxa_func[taxa_func] = [] + 
peptides_in_taxa_func[taxa_func].append(peptide) + + + self.peptides_linked_dict = {'taxa': peptides_in_taxa, 'func': peptides_in_func, 'taxa_func': peptides_in_taxa_func} + return self.peptides_linked_dict + def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, processing_after_sum: bool = False, @@ -615,7 +679,10 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00, #! fllowing code is for the normal mode # reset outlier_status self.outlier_status = {'peptide': None, 'taxa': None, 'func': None, 'taxa_func': None} - + # reset split_func status + self.split_func_status = split_func + self.split_func_sep = split_func_params['split_by'] + df = self.original_df.copy() # perform data pre-processing if not processing_after_sum: @@ -727,7 +794,9 @@ def strip_taxa(x, level): df_taxa = df_taxa[df_taxa['peptide_num'] >= peptide_num_threshold['taxa']] print(f"Taxa number with '{level}' level, peptide_num >= [{peptide_num_threshold['taxa']}]: {df_taxa.shape[0]}") #-----Taxa Table End----- - + + #------create peptides_dict in taxa, func and taxa_func------ + self.create_peptides_dict_in_taxa_func(dfc) # ----- create taxa_func table ----- df_taxa_func = dfc.copy() @@ -879,7 +948,7 @@ def get_df(self, table_name:str = 'taxa'): 'processing_order': None}, peptide_num_threshold = {'taxa': 1, 'func': 1, 'taxa_func': 1}, keep_unknow_func=False, sum_protein=False, sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap'}, - split_func=True, split_func_params = {'split_by': ';', 'share_intensity': False} + split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False} ) sw.check_attributes() \ No newline at end of file diff --git a/metax/taxafunc_analyzer/analyzer_utils/get_matrix.py b/metax/taxafunc_analyzer/analyzer_utils/get_matrix.py index 01c77c4..acd6ead 100644 --- a/metax/taxafunc_analyzer/analyzer_utils/get_matrix.py +++ 
b/metax/taxafunc_analyzer/analyzer_utils/get_matrix.py @@ -4,8 +4,8 @@ class GetMatrix: def __init__(self, tfa): self.tfa = tfa - def get_intensity_matrix(self, func_name: str = None, taxon_name: str = None, - peptide_seq: str = None, sample_list: list = None, condition:list = None) -> pd.DataFrame: + def get_intensity_matrix(self, func_name: str|None = None, taxon_name: str|None = None, + peptide_seq: str|None = None, sample_list: list|None = None, condition:list|None = None) -> pd.DataFrame: # input: a taxon with its function, a function with its taxon, # and the peptides in the function or taxon # output: a matrix of the intensity of the taxon or function or peptide in each sample @@ -21,10 +21,13 @@ def get_intensity_matrix(self, func_name: str = None, taxon_name: str = None, if taxon_name is None: dft = dft[dft[self.tfa.func_name] == func_name] dft.set_index('Taxon', inplace=True) - if taxon_name is not None: - dft = self.tfa.clean_df[(self.tfa.clean_df['Taxon'] == taxon_name) & ( - self.tfa.clean_df[self.tfa.func_name] == func_name)] - dft.set_index(self.tfa.peptide_col_name, inplace=True) + + if taxon_name is not None: #all peptides in the taxon-function + # get the intensity matrix of the taxon with its function + taxa_func = f'{taxon_name} <{func_name}>' + peptides_list = self.tfa.peptides_linked_dict['taxa_func'][taxa_func] + dft = self.tfa.peptide_df.loc[peptides_list] + elif taxon_name is not None and peptide_seq is None: dft = self.tfa.func_taxa_df.copy() diff --git a/metax/utils/version.py b/metax/utils/version.py index 89c504f..7225fd1 100644 --- a/metax/utils/version.py +++ b/metax/utils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.111.5' +__version__ = '1.111.6' API_version = '2' \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2eeba7f..ea0a2f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "MetaXTools" -version = "1.111.5" +version = "1.111.6" 
description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics." readme = "README_PyPi.md" license = { text = "NorthOmics" }