- Fix: Fixed a bug in extracting the peptides of taxa, funcs or taxa…

…-funcs when the function items are split.
byemaxx · Aug 22, 2024 · 5dfc08b · 5dfc08b
1 parent 4e3b361
commit 5dfc08b
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 30 deletions.
diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md
@@ -1,9 +1,14 @@
+# Version: 1.111.6
+## Date: 2024-08-21
+### Changes:
+- Fix: Fixed a bug in extracting the peptides of taxa, funcs or taxa-funcs when the function items are split.
+
+
 # Version: 1.111.5
 ## Date: 2024-08-21
 ### Changes:
 - Change: Optimized the x-axis and y-axis labels of the heatmap plot to make the labels more clear.
 
-
 # Version: 1.111.4
 ## Date: 2024-08-21
 ### Changes:

diff --git a/metax/gui/main_gui.py b/metax/gui/main_gui.py
@@ -3479,28 +3479,19 @@ def plot_basic_list(self, plot_type='heatmap'):
                 df = self.tfa.peptide_df.copy()
 
             else:
+                peptides_list = []
+
                 if table_name == 'Taxa':
-                    df = self.tfa.clean_df.loc[self.tfa.clean_df['Taxon'].isin(self.basic_heatmap_list)]
-                    df.index = df[self.tfa.peptide_col_name]
+                    for i in self.basic_heatmap_list:
+                        peptides_list.extend(self.tfa.peptides_linked_dict['taxa'][i])
 
                 elif table_name == 'Functions':
-                    df = self.tfa.clean_df.loc[self.tfa.clean_df[self.tfa.func_name].isin(self.basic_heatmap_list)]
-                    df.index = df[self.tfa.peptide_col_name]
+                    for i in self.basic_heatmap_list:
+                        peptides_list.extend(self.tfa.peptides_linked_dict['func'][i])
 
                 elif table_name == 'Taxa-Functions':
-                    df_list = [] 
                     for i in self.basic_heatmap_list:
-                        taxon, func = i.split(' <')
-                        func = func[:-1] 
-                        dft = self.tfa.clean_df.loc[(self.tfa.clean_df['Taxon'] == taxon) & (self.tfa.clean_df[self.tfa.func_name] == func)]
-                        df_list.append(dft)
-
-                    if df_list:  
-                        df_all = pd.concat(df_list)
-                        df_all.index = df_all[self.tfa.peptide_col_name] 
-                        df = df_all
-                    else:
-                        raise ValueError('No valid taxa-function belongs to the selected taxa-function!')
+                        peptides_list.extend(self.tfa.peptides_linked_dict['taxa_func'][i])
 
                 elif table_name == 'Proteins':
                     QMessageBox.warning(self.MainWindow, 'Warning',
@@ -3511,9 +3502,9 @@ def plot_basic_list(self, plot_type='heatmap'):
                     return
 
                 else: # Peptide
-                    df = self.tfa.peptide_df.copy()
-                    df = df.loc[self.basic_heatmap_list]
+                    peptides_list = self.basic_heatmap_list
 
+                df = self.tfa.peptide_df.loc[peptides_list]
                 df = df[sample_list]
 
         else:

diff --git a/metax/taxafunc_analyzer/analyzer.py b/metax/taxafunc_analyzer/analyzer.py
@@ -70,11 +70,15 @@ def __init__(
         self.func_taxa_df: Optional[pd.DataFrame] = None
         self.taxa_func_linked_dict: Optional[Dict[str, List[tuple]]] = None
         self.func_taxa_linked_dict: Optional[Dict[str, List[tuple]]] = None
+        self.peptides_linked_dict = {'taxa': {}, 'func': {}, 'taxa_func': {}}
         self.protein_df: Optional[pd.DataFrame] = None
         self.any_df_mode = any_df_mode  # if True, the consider the TaxaFunc df as other_df
         self.custom_df: Optional[pd.DataFrame] = None # other df, any df that user want to add
         self.outlier_status = {'peptide': None, 'taxa': None, 'func': None,
                                'taxa_func': None, 'protein': None, 'custom': None}
+
+        self.split_func_status:bool = False
+        self.split_func_sep:str = ''
 
         # load function
         self.BasicStats = BasicStats(self)
@@ -560,6 +564,7 @@ def split_func(self, taxa_func_df, split_func_params: dict = {'split_by': ',', '
             num_splits = len(split_funcs_list)
 
             for new_func in split_funcs_list:
+                new_func = new_func.strip()
                 split_row = row[sample_list] / num_splits if share_intensity else row[sample_list]
                 split_row[func_col] = new_func
                 split_row[taxon_col] = row[taxon_col]
@@ -577,6 +582,65 @@ def split_func(self, taxa_func_df, split_func_params: dict = {'split_by': ',', '
 
         return new_data
 
+    def create_peptides_dict_in_taxa_func(self, dfc):
+        """
+        Creates a dictionary of peptides in taxa, func, and taxa_func.
+        Parameters:
+            dfc (DataFrame): The input DataFrame containing the peptide, taxon, and function columns.
+        Returns:
+            self.peptides_linked_dict (dict): A dictionary containing the peptides in taxa, func, and taxa_func.
+        """
+        print("Creating peptides_linked_dict in taxa, func, and taxa_func...")
+        df = dfc.copy()[[self.peptide_col_name, 'Taxon', self.func_name]]
+        peptide_col = self.peptide_col_name
+        taxa_col = 'Taxon'
+        func_col = self.func_name
+
+        peptides_in_taxa_func = {}
+        peptides_in_taxa = {}
+        peptides_in_func = {}
+
+        if self.split_func_status:
+            for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating peptides_dict"):
+                peptide = row[peptide_col]
+                taxa = row[taxa_col]
+                func_list = [f.strip() for f in row[func_col].split(self.split_func_sep)]
+
+                if taxa not in peptides_in_taxa:
+                    peptides_in_taxa[taxa] = []
+                peptides_in_taxa[taxa].append(peptide)
+
+                for f in func_list:
+                    if f not in peptides_in_func:
+                        peptides_in_func[f] = []
+                    peptides_in_func[f].append(peptide)
+                    taxa_func = f'{taxa} <{f}>'
+                    if taxa_func not in peptides_in_taxa_func:
+                        peptides_in_taxa_func[taxa_func] = []
+                    peptides_in_taxa_func[taxa_func].append(peptide)
+        else:
+            for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating peptides_dict"):
+                peptide = row[peptide_col]
+                taxa = row[taxa_col]
+                func = row[func_col]
+
+                if taxa not in peptides_in_taxa:
+                    peptides_in_taxa[taxa] = []
+                peptides_in_taxa[taxa].append(peptide)
+
+                if func not in peptides_in_func:
+                    peptides_in_func[func] = []
+                peptides_in_func[func].append(peptide)
+
+                taxa_func = f'{taxa} <{func}>'
+                if taxa_func not in peptides_in_taxa_func:
+                    peptides_in_taxa_func[taxa_func] = []
+                peptides_in_taxa_func[taxa_func].append(peptide)
+
+
+        self.peptides_linked_dict = {'taxa': peptides_in_taxa, 'func': peptides_in_func, 'taxa_func': peptides_in_taxa_func}
+        return self.peptides_linked_dict
+
 
     def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
                          processing_after_sum: bool = False,
@@ -615,7 +679,10 @@ def set_multi_tables(self, level: str = 's', func_threshold:float = 1.00,
         #! fllowing code is for the normal mode
         # reset outlier_status
         self.outlier_status = {'peptide': None, 'taxa': None, 'func': None, 'taxa_func': None}
-
+        # reset split_func status
+        self.split_func_status = split_func
+        self.split_func_sep = split_func_params['split_by']
+
         df = self.original_df.copy()
         # perform data pre-processing
         if not processing_after_sum:
@@ -727,7 +794,9 @@ def strip_taxa(x, level):
         df_taxa = df_taxa[df_taxa['peptide_num'] >= peptide_num_threshold['taxa']]
         print(f"Taxa number with '{level}' level, peptide_num >= [{peptide_num_threshold['taxa']}]: {df_taxa.shape[0]}")
         #-----Taxa Table End-----
-
+
+        #------create peptides_dict in taxa, func and taxa_func------
+        self.create_peptides_dict_in_taxa_func(dfc)
 
         # ----- create taxa_func table -----
         df_taxa_func = dfc.copy()
@@ -879,7 +948,7 @@ def get_df(self, table_name:str = 'taxa'):
                                                             'processing_order': None},
                     peptide_num_threshold = {'taxa': 1, 'func': 1, 'taxa_func': 1},
                     keep_unknow_func=False, sum_protein=False, sum_protein_params = {'method': 'razor', 'by_sample': False, 'rank_method': 'unique_counts', 'greedy_method': 'heap'},
-                    split_func=True, split_func_params = {'split_by': ';', 'share_intensity': False}
+                    split_func=True, split_func_params = {'split_by': '|', 'share_intensity': False}
                     )
 
     sw.check_attributes()
diff --git a/metax/taxafunc_analyzer/analyzer_utils/get_matrix.py b/metax/taxafunc_analyzer/analyzer_utils/get_matrix.py
@@ -4,8 +4,8 @@ class GetMatrix:
     def __init__(self, tfa):
         self.tfa = tfa
 
-    def get_intensity_matrix(self, func_name: str = None, taxon_name: str = None,
-                             peptide_seq: str = None, sample_list: list = None, condition:list = None) -> pd.DataFrame:
+    def get_intensity_matrix(self, func_name: str|None = None, taxon_name: str|None  = None,
+                             peptide_seq: str|None  = None, sample_list: list|None  = None, condition:list|None  = None) -> pd.DataFrame:
         # input: a taxon with its function, a function with its taxon,
         # and the peptides in the function or taxon
         # output: a matrix of the intensity of the taxon or function or peptide in each sample
@@ -21,10 +21,13 @@ def get_intensity_matrix(self, func_name: str = None, taxon_name: str = None,
             if taxon_name is None:
                 dft = dft[dft[self.tfa.func_name] == func_name]
                 dft.set_index('Taxon', inplace=True)
-            if taxon_name is not None:
-                dft = self.tfa.clean_df[(self.tfa.clean_df['Taxon'] == taxon_name) & (
-                    self.tfa.clean_df[self.tfa.func_name] == func_name)]
-                dft.set_index(self.tfa.peptide_col_name, inplace=True)
+
+            if taxon_name is not None: #all peptides in the taxon-function
+                # get the intensity matrix of the taxon with its function
+                taxa_func = f'{taxon_name} <{func_name}>'
+                peptides_list = self.tfa.peptides_linked_dict['taxa_func'][taxa_func]
+                dft = self.tfa.peptide_df.loc[peptides_list]
+
 
         elif taxon_name is not None and peptide_seq is None:
             dft = self.tfa.func_taxa_df.copy()

diff --git a/metax/utils/version.py b/metax/utils/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.111.5'
+__version__ = '1.111.6'
 API_version = '2'
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "MetaXTools"
-version = "1.111.5"
+version = "1.111.6"
 description = "MetaXTools is a novel tool for linking peptide sequences with taxonomic and functional information in Metaproteomics."
 readme = "README_PyPi.md"
 license = { text = "NorthOmics" }