
Commit

Merge commit '68f22b50ae40126e6473717ca11de9a08b6f1815'
byemaxx committed Jun 27, 2024
2 parents 9deb712 + 68f22b5 commit 6191599
Showing 13 changed files with 839 additions and 418 deletions.
18 changes: 18 additions & 0 deletions Docs/ChangeLog.md
@@ -1,3 +1,21 @@
# Version: 1.107.8
## Date: 2024-06-26
### Changes:
- Change:
- 1. Use a heap as the default data structure when applying the razor method to sum peptide intensity into protein intensity.
- 2. Show the update message in a dialog window instead of the message box, so that long update messages are no longer cut off.

# Version: 1.107.7
## Date: 2024-06-24
### Changes:
- Change: Changed the method of summing peptide intensity into protein intensity: the "razor" method now behaves the same as MaxQuant's, and a new "rank" method was added.

# Version: 1.107.6
## Date: 2024-06-19
### Changes:
- New: Added an option in Settings to let users set the theme color (white or dark) of the HTML plots.


# Version: 1.107.5
## Date: 2024-06-18
### Changes:
Binary file modified Docs/MetaX_Cookbook.assets/settings_page2.png
16 changes: 12 additions & 4 deletions Docs/MetaX_Cookbook.md
@@ -116,7 +116,7 @@ The **Database Updater** allows updating the database built by the **Database Bu

### 1. Results from MAG Workflow

The peptide results use Metagenome-assembled genomes (MAGs) as the reference database for protein searches, e.g., MetaLab-MAG and other databases like MGnify.
The peptide results use Metagenome-assembled genomes (MAGs) as the reference database for protein searches, e.g., MetaLab-MAG, MetaLab-DIA, and other workflows that use MAG databases such as MGnify or customized MAG databases.

- Annotate the peptide to Operational Taxa-Functions (OTF) Table before analysis using the <u>Peptide Annotator</u>.

@@ -227,9 +227,16 @@ The Data Overview provides basic information about your data, such as the number

Click **Create Proteins Intensity Table** to sum peptides to proteins if the Protein column is in the original table.

- **Occam's Razor and Anti-Razor:** Methods available for inferring shared peptides.
1. Build the rank of proteins.
2. Choose the protein with a higher rank for the shared peptide.
- **Occam's Razor**, **Anti-Razor** and **Rank:** Methods available for handling shared peptides (an illustrative sketch of these rules follows below).
- Razor:
1. Build a minimal set of proteins that covers all peptides.
2. For each peptide, choose the protein with the most peptides (if multiple proteins have the same number of peptides, share the intensity among them).
- Anti-Razor:
- The intensity of each peptide is shared by all proteins that contain it.
- Rank:
1. Build the rank of proteins.
2. Assign the shared peptide to the protein with the higher rank.


- **Methods to Build Protein Rank:**
- unique_counts: Use the counts of proteins inferred by unique peptides.
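
Below is a minimal, illustrative sketch (not the MetaX implementation) of how one shared peptide's intensity would be assigned under the three strategies; the names `peptide_intensity`, `matched_proteins`, `protein_peptide_counts`, and `protein_rank` are invented for the example, and the Anti-Razor and Rank rules follow the interpretation given in the list above.

```python
# Toy example: one peptide with intensity 100 is matched by two proteins.
peptide_intensity = 100.0
matched_proteins = ["P1", "P2"]
protein_peptide_counts = {"P1": 5, "P2": 3}  # peptides covered by each protein
protein_rank = {"P1": 1, "P2": 4}            # higher value = higher rank

# Razor: credit the matched protein that covers the most peptides;
# if several proteins tie, share the intensity among them.
best_count = max(protein_peptide_counts[p] for p in matched_proteins)
tied = [p for p in matched_proteins if protein_peptide_counts[p] == best_count]
razor_share = {p: peptide_intensity / len(tied) for p in tied}        # {"P1": 100.0}

# Anti-Razor: every matched protein is credited with the peptide's intensity.
anti_razor_share = {p: peptide_intensity for p in matched_proteins}   # P1 and P2 both credited

# Rank: only the highest-ranked matched protein receives the intensity.
top = max(matched_proteins, key=lambda p: protein_rank[p])
rank_share = {top: peptide_intensity}                                 # {"P2": 100.0}
```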
@@ -709,6 +716,7 @@ We can select <u>**meta**</u> <u>**groups**</u> or <u>**samples**</u> (default a
- The yellow dots are taxa and the grey dots are functions; the size of a dot represents its intensity
- The red dots are the taxa we focused on
- The green dots are the functions we focused on
- More parameters can be set in **Dev**->**Settings**->**Others** (e.g. node shape, color, line style)

<img src="./MetaX_Cookbook.assets/taxa_func_network.png" alt="taxa_func_network" />

159 changes: 139 additions & 20 deletions utils/AnalyzerUtils/SumProteinIntensity.py
@@ -1,48 +1,54 @@
# This file is used to sum the protein intensity for each sample
# Method: razor or anti-razor
# Method: razor, anti-razor or rank
# By sample: True or False
# Output: a dataframe with protein as index and sample as columns
##############################################
# USAGE:
# from utils.AnalyzerUtils.SumProteinIntensity import SumProteinIntensity
# out = SumProteinIntensity(sw)
# df1 = out.sum_protein_intensity(method='razor', by_sample=False, rank_method='count')
# df2 = out.sum_protein_intensity(method='razor', by_sample=False, rank_method='shared')
# df3 = out.sum_protein_intensity(method='razor', by_sample=False, rank_method='unique')
# df0 = out.sum_protein_intensity(method='razor')
# df1 = out.sum_protein_intensity(method='rank', by_sample=False, rank_method='all_counts')
# df2 = out.sum_protein_intensity(method='rank', by_sample=False, rank_method='shared_intensity')
# df3 = out.sum_protein_intensity(method='rank', by_sample=False, rank_method='unique_counts')
# df4 = out.sum_protein_intensity(method='anti-razor')
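# Additional illustrative example: the 'razor' method also accepts a greedy_method
# argument ('heap' by default, or 'greedy') for solving the set cover step, e.g.:
# df5 = out.sum_protein_intensity(method='razor', greedy_method='greedy')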
##############################################

from collections import defaultdict
import pandas as pd
from tqdm import tqdm


class SumProteinIntensity:
def __init__(self, taxa_func_analyzer):
self.tfa = taxa_func_analyzer
self.res_intensity_dict = {} #store all sample to output
self.rank_dict = {} #store the rank of protein intensity for each sample temporarily
self.rank_method = None
self.res_intensity_dict = {} # store all sample to output
self.rank_dict = {} # store the rank of protein intensity for each sample temporarily
self.rank_method = None # only used for rank method
self.extract_col_name = [self.tfa.peptide_col_name, self.tfa.protein_col_name] + self.tfa.sample_list
self.df = self.tfa.original_df.loc[:,self.extract_col_name]
self.df = self.tfa.original_df.loc[:, self.extract_col_name]
self._init_dicts()
self.greedy_method = None # only used for razor method


def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts'):
def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts', greedy_method='heap'):

if method not in ['razor', 'anti-razor']:
raise ValueError('Method must in ["razor", "anti-razor"]')
if method not in ['razor', 'anti-razor', 'rank']:
raise ValueError('Method must in ["razor", "anti-razor", "rank"]')
if rank_method not in ['shared_intensity', 'all_counts', 'unique_counts', 'unique_intensity']:
raise ValueError('Rank method must in ["shared_intensity", "all_counts", "unique_counts", "unique_intensity"]')

self.rank_method = rank_method
self.greedy_method = greedy_method

if method == 'razor':
if method == 'rank':
print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [{by_sample}] rank_method: [{rank_method}]-------------")
# make a dict to count the intensity of each protein; intensity shared by peptides will be divided by the number of peptides
if by_sample:
for sample in self.tfa.sample_list:
# update the dict for each sample
print(f'Creating protein rank dict for [{sample}] by shared intensity', end='\r')
self._update_protein_rank_dict(sample_name = sample, rank_method = rank_method)
self._sum_protein_razor(sample, by_sample)
self._sum_protein_rank(sample, by_sample)

else: # without sample
# only need to create the dict once
@@ -51,8 +57,12 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un
self._update_protein_rank_dict(sample_name = None, rank_method = rank_method)

for sample in self.tfa.sample_list:
self._sum_protein_razor(sample, by_sample)

self._sum_protein_rank(sample, by_sample)
elif method == 'razor':
print('start to sum protein intensity using method: [razor]')
# use Set Cover Problem to get the protein list, then sum the intensity
pep_to_protein = self._create_pep_to_protein_razor()
self._sum_protein_razor(pep_to_protein)

elif method == 'anti-razor':
print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------")
@@ -76,7 +86,119 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un

return res_df

# razor method
def find_minimum_protein_set(self, peptides, protein_to_peptides):
protein_to_peptides_copy = protein_to_peptides.copy()
peptides_to_cover = set(peptides)
selected_proteins = set()
method = self.greedy_method

if method == 'greedy':
print('Start creating protein dict for "Set Cover Problem" with Greedy Approximation Algorithm')
with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar:
while peptides_to_cover:
best_protein = None
peptides_covered_by_best = set()
for protein, covered_peptides in protein_to_peptides_copy.items():
covered = peptides_to_cover & covered_peptides
if len(covered) > len(peptides_covered_by_best):
best_protein = protein
peptides_covered_by_best = covered

if not best_protein:
break

selected_proteins.add(best_protein)
peptides_to_cover -= peptides_covered_by_best
protein_to_peptides_copy.pop(best_protein) # remove the protein from the dict to speed up the process
pbar.update(len(peptides_covered_by_best))
elif method == 'heap':
import heapq
print('Start creating protein dict for "Set Cover Problem" with Heap Optimization of Greedy Approximation Algorithm')
protein_coverage = {protein: covered_peptides & peptides_to_cover
for protein, covered_peptides in protein_to_peptides_copy.items()}
protein_heap = [(-len(covered), protein) for protein, covered in protein_coverage.items()]
heapq.heapify(protein_heap)

with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar:
while peptides_to_cover:
# reset so the outer loop exits cleanly if the heap runs out of usable entries
best_protein = None
peptides_covered_by_best = set()
while protein_heap:
max_covered, best_protein = heapq.heappop(protein_heap)
if best_protein in protein_coverage:
peptides_covered_by_best = protein_coverage.pop(best_protein)
break

if not best_protein or not peptides_covered_by_best:
break

selected_proteins.add(best_protein)
peptides_to_cover -= peptides_covered_by_best
pbar.update(len(peptides_covered_by_best))

# update other proteins' coverage
for protein in list(protein_coverage.keys()):
if protein_coverage[protein] & peptides_covered_by_best:
protein_coverage[protein] -= peptides_covered_by_best
heapq.heappush(protein_heap, (-len(protein_coverage[protein]), protein))
if not protein_coverage[protein]:
del protein_coverage[protein]
else:
raise ValueError(f"Invalid greedy method: {method}. Must be 'greedy' or 'heap'")

return selected_proteins
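
# Illustrative example (comment only): with peptides {'p1', 'p2', 'p3'} and
# protein_to_peptides = {'A': {'p1', 'p2'}, 'B': {'p2', 'p3'}, 'C': {'p3'}},
# both the 'greedy' and 'heap' variants select {'A', 'B'} for this ordering:
# 'A' covers 'p1' and 'p2' first, then 'B' covers the remaining 'p3'.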

def _create_pep_to_protein_razor(self) -> dict:
"""
Create a dictionary mapping peptides to proteins based on a minimum protein set.
Returns:
dict: A dictionary mapping peptides to proteins.
key: peptide
value: a list of proteins
"""

df = self.df.loc[:, [self.tfa.peptide_col_name, self.tfa.protein_col_name]]
# Create a dictionary mapping proteins to peptides
protein_to_peptides = defaultdict(set)
peptides = set(df[self.tfa.peptide_col_name])

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating protein to peptides mapping"):
sequence = row[self.tfa.peptide_col_name]
proteins = row[self.tfa.protein_col_name].split(';')
for protein in proteins:
protein_to_peptides[protein].add(sequence)

mini_protein_set = self.find_minimum_protein_set(peptides, protein_to_peptides)

# remove the proteins not in the mini_protein_set from the protein_to_peptides
filtered_protein_to_peptides = {protein: protein_to_peptides[protein] for protein in mini_protein_set}
# Assign each peptide to the protein that contains it with the highest peptide count
print('Assigning peptides to proteins')
peptide_to_protein = defaultdict(list)
for peptide in tqdm(peptides, desc="Assigning peptides to proteins"):
possible_proteins = [protein for protein, peps in filtered_protein_to_peptides.items() if peptide in peps]
if possible_proteins:
# Find the protein(s) containing this peptide that cover the most peptides
max_protein_count = max(len(filtered_protein_to_peptides[protein]) for protein in possible_proteins)
best_proteins = [protein for protein in possible_proteins if len(filtered_protein_to_peptides[protein]) == max_protein_count]
peptide_to_protein[peptide].extend(best_proteins)

return peptide_to_protein
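
# Illustrative example (comment only): if peptide 'p2' is contained by proteins 'A'
# and 'B' of the minimal set and both cover the same number of peptides, 'p2' maps to
# ['A', 'B'] and its intensity is later credited to both via _update_output_dict;
# otherwise it maps only to the protein covering more peptides.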

def _sum_protein_razor(self, peptide_to_protein: dict):

for sample in tqdm(self.tfa.sample_list):
print(f'Assigning protein intensity for [{sample}]')
df = self.df.loc[:,[ self.tfa.peptide_col_name, sample]]
# create a dict to store the intensity of each peptide
df.set_index(self.tfa.peptide_col_name, inplace=True)
peptide_intensity_dict = df.to_dict()[sample]
for peptide, proteins in peptide_to_protein.items():
intensity = peptide_intensity_dict[peptide]
self._update_output_dict(proteins, sample, intensity)



def _init_dicts(self):
for sample in self.tfa.sample_list:
self.res_intensity_dict[sample] = {}
@@ -147,7 +269,7 @@ def _update_output_dict(self, protein_list: list, sample_name:str, intensity:flo
self.res_intensity_dict[sample_name][protein] = intensity


def _sum_protein_razor(self, sample_name:str, by_sample=False):
def _sum_protein_rank(self, sample_name:str, by_sample=False):
# print in one line
print(f'Assigning protein intensity for [{sample_name}]', end='\r')
df = self.df.loc[:,[ self.tfa.protein_col_name, sample_name]]
@@ -180,7 +302,4 @@ def _sum_protein_anti_razor(self, sample_name:str):
for row in df.itertuples():
proteins = row[1].split(';')
intensity = row[2]
self._update_output_dict(proteins, sample_name, intensity)



self._update_output_dict(proteins, sample_name, intensity)
