
Commit

Merge commit '68f22b50ae40126e6473717ca11de9a08b6f1815'
byemaxx committed Jun 27, 2024
2 parents 9deb712 + 68f22b5 commit 6191599
Showing 13 changed files with 839 additions and 418 deletions.
18 changes: 18 additions & 0 deletions Docs/ChangeLog.md
@@ -1,3 +1,21 @@
# Version: 1.107.8
## Date: 2024-06-26
### Changes:
- Change:
- 1. Use a heap as the default data structure when applying the razor method to sum peptide intensity into protein intensity.
- 2. Show the update message in a dialog window instead of the message box, so that long update messages are no longer cut off.

# Version: 1.107.7
## Date: 2024-06-24
### Changes:
- Change: Changed the method of summing peptide intensity into protein intensity: the "razor" method now behaves the same as MaxQuant's, and a new "rank" method was added.

# Version: 1.107.6
## Date: 2024-06-19
### Changes:
- New: Added an option in Settings to let users set the theme color (white or dark) of the HTML plots.


# Version: 1.107.5
## Date: 2024-06-18
### Changes:
Binary file modified Docs/MetaX_Cookbook.assets/settings_page2.png
16 changes: 12 additions & 4 deletions Docs/MetaX_Cookbook.md
@@ -116,7 +116,7 @@ The **Database Updater** allows updating the database built by the **Database Bu

### 1. Results from MAG Workflow

The peptide results use Metagenome-assembled genomes (MAGs) as the reference database for protein searches, e.g., MetaLab-MAG and other databases like MGnify.
The peptide results use Metagenome-assembled genomes (MAGs) as the reference database for protein searches, e.g., MetaLab-MAG, MetaLab-DIA, and other workflows that use MAG databases such as MGnify or customized MAG databases.

- Annotate the peptide to Operational Taxa-Functions (OTF) Table before analysis using the <u>Peptide Annotator</u>.

@@ -227,9 +227,16 @@ The Data Overview provides basic information about your data, such as the number

Click **Create Proteins Intensity Table** to sum peptides to proteins if the Protein column is in the original table.

- **Occam's Razor and Anti-Razor:** Methods available for inferring shared peptides.
1. Build the rank of proteins.
2. Choose the protein with a higher rank for the shared peptide.
- **Occam's Razor**, **Anti-Razor** and **Rank:** Methods available for handling shared peptides (an illustrative sketch of these rules follows below).
- Razor:
1. Build a minimal set of proteins that covers all peptides.
2. For each peptide, choose the protein with the most peptides (if multiple proteins have the same number of peptides, share the intensity among them).
- Anti-Razor:
- The intensity of each peptide is shared by all proteins that contain it.
- Rank:
1. Build the rank of proteins.
2. Assign the shared peptide to the protein with the higher rank.


- **Methods to Build Protein Rank:**
- unique_counts: Use the counts of proteins inferred by unique peptides.
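
Below is a minimal, illustrative sketch (not the MetaX implementation) of how one shared peptide's intensity would be assigned under the three strategies; the names `peptide_intensity`, `matched_proteins`, `protein_peptide_counts`, and `protein_rank` are invented for the example, and the Anti-Razor and Rank rules follow the interpretation given in the list above.

```python
# Toy example: one peptide with intensity 100 is matched by two proteins.
peptide_intensity = 100.0
matched_proteins = ["P1", "P2"]
protein_peptide_counts = {"P1": 5, "P2": 3}  # peptides covered by each protein
protein_rank = {"P1": 1, "P2": 4}            # higher value = higher rank

# Razor: credit the matched protein that covers the most peptides;
# if several proteins tie, share the intensity among them.
best_count = max(protein_peptide_counts[p] for p in matched_proteins)
tied = [p for p in matched_proteins if protein_peptide_counts[p] == best_count]
razor_share = {p: peptide_intensity / len(tied) for p in tied}        # {"P1": 100.0}

# Anti-Razor: every matched protein is credited with the peptide's intensity.
anti_razor_share = {p: peptide_intensity for p in matched_proteins}   # P1 and P2 both credited

# Rank: only the highest-ranked matched protein receives the intensity.
top = max(matched_proteins, key=lambda p: protein_rank[p])
rank_share = {top: peptide_intensity}                                 # {"P2": 100.0}
```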
@@ -709,6 +716,7 @@ We can select <u>**meta**</u> <u>**groups**</u> or <u>**samples**</u> (default a
- The yellow dots are taxa and the grey dots are functions; the size of a dot represents its intensity
- The red dots are the taxa we focused on
- The green dots are the functions we focused on
- More parameters can be set in **Dev**->**Settings**->**Others** (e.g. node shape, color, line style)

<img src="./MetaX_Cookbook.assets/taxa_func_network.png" alt="taxa_func_network" />

159 changes: 139 additions & 20 deletions utils/AnalyzerUtils/SumProteinIntensity.py
@@ -1,48 +1,54 @@
# This file is used to sum the protein intensity for each sample
# Method: razor or anti-razor
# Method: razor, anti-razor or rank
# By sample: True or False
# Output: a dataframe with protein as index and sample as columns
##############################################
# USAGE:
# from utils.AnalyzerUtils.SumProteinIntensity import SumProteinIntensity
# out = SumProteinIntensity(sw)
# df1 = out.sum_protein_intensity(method='razor', by_sample=False, rank_method='count')
# df2 = out.sum_protein_intensity(method='razor', by_sample=False, rank_method='shared')
# df3 = out.sum_protein_intensity(method='razor', by_sample=False, rank_method='unique')
# df0 = out.sum_protein_intensity(method='razor')
# df1 = out.sum_protein_intensity(method='rank', by_sample=False, rank_method='all_counts')
# df2 = out.sum_protein_intensity(method='rank', by_sample=False, rank_method='shared_intensity')
# df3 = out.sum_protein_intensity(method='rank', by_sample=False, rank_method='unique_counts')
# df4 = out.sum_protein_intensity(method='anti-razor')
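# Additional illustrative example: the 'razor' method also accepts a greedy_method
# argument ('heap' by default, or 'greedy') for solving the set cover step, e.g.:
# df5 = out.sum_protein_intensity(method='razor', greedy_method='greedy')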
##############################################

from collections import defaultdict
import pandas as pd
from tqdm import tqdm


class SumProteinIntensity:
def __init__(self, taxa_func_analyzer):
self.tfa = taxa_func_analyzer
self.res_intensity_dict = {} #store all sample to output
self.rank_dict = {} #store the rank of protein intensity for each sample temporarily
self.rank_method = None
self.res_intensity_dict = {} # store all sample to output
self.rank_dict = {} # store the rank of protein intensity for each sample temporarily
self.rank_method = None # only used for rank method
self.extract_col_name = [self.tfa.peptide_col_name, self.tfa.protein_col_name] + self.tfa.sample_list
self.df = self.tfa.original_df.loc[:,self.extract_col_name]
self.df = self.tfa.original_df.loc[:, self.extract_col_name]
self._init_dicts()
self.greedy_method = None # only used for razor method


def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts'):
def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts', greedy_method='heap'):

if method not in ['razor', 'anti-razor']:
raise ValueError('Method must in ["razor", "anti-razor"]')
if method not in ['razor', 'anti-razor', 'rank']:
raise ValueError('Method must in ["razor", "anti-razor", "rank"]')
if rank_method not in ['shared_intensity', 'all_counts', 'unique_counts', 'unique_intensity']:
raise ValueError('Rank method must in ["shared_intensity", "all_counts", "unique_counts", "unique_intensity"]')

self.rank_method = rank_method
self.greedy_method = greedy_method

if method == 'razor':
if method == 'rank':
print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [{by_sample}] rank_method: [{rank_method}]-------------")
# make a dict to count the intensity of each protein; intensity shared by peptides will be divided by the number of peptides
if by_sample:
for sample in self.tfa.sample_list:
# update the dict for each sample
print(f'Creating protein rank dict for [{sample}] by shared intensity', end='\r')
self._update_protein_rank_dict(sample_name = sample, rank_method = rank_method)
self._sum_protein_razor(sample, by_sample)
self._sum_protein_rank(sample, by_sample)

else: # without sample
# only need to create the dict once
@@ -51,8 +57,12 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un
self._update_protein_rank_dict(sample_name = None, rank_method = rank_method)

for sample in self.tfa.sample_list:
self._sum_protein_razor(sample, by_sample)

self._sum_protein_rank(sample, by_sample)
elif method == 'razor':
print('start to sum protein intensity using method: [razor]')
# use Set Cover Problem to get the protein list, then sum the intensity
pep_to_protein = self._create_pep_to_protein_razor()
self._sum_protein_razor(pep_to_protein)

elif method == 'anti-razor':
print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [True] rank_method: [Shared]-------------")
@@ -76,7 +86,119 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un

return res_df

# razor method
def find_minimum_protein_set(self, peptides, protein_to_peptides):
protein_to_peptides_copy = protein_to_peptides.copy()
peptides_to_cover = set(peptides)
selected_proteins = set()
method = self.greedy_method

if method == 'greedy':
print('Start creating protein dict for "Set Cover Problem" with Greedy Approximation Algorithm')
with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar:
while peptides_to_cover:
best_protein = None
peptides_covered_by_best = set()
for protein, covered_peptides in protein_to_peptides_copy.items():
covered = peptides_to_cover & covered_peptides
if len(covered) > len(peptides_covered_by_best):
best_protein = protein
peptides_covered_by_best = covered

if not best_protein:
break

selected_proteins.add(best_protein)
peptides_to_cover -= peptides_covered_by_best
protein_to_peptides_copy.pop(best_protein) # remove the protein from the dict to speed up the process
pbar.update(len(peptides_covered_by_best))
elif method == 'heap':
import heapq
print('Start creating protein dict for "Set Cover Problem" with Heap Optimization of Greedy Approximation Algorithm')
protein_coverage = {protein: covered_peptides & peptides_to_cover
for protein, covered_peptides in protein_to_peptides_copy.items()}
protein_heap = [(-len(covered), protein) for protein, covered in protein_coverage.items()]
heapq.heapify(protein_heap)

with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar:
while peptides_to_cover:
# reset so the outer loop exits cleanly if the heap runs out of usable entries
best_protein = None
peptides_covered_by_best = set()
while protein_heap:
max_covered, best_protein = heapq.heappop(protein_heap)
if best_protein in protein_coverage:
peptides_covered_by_best = protein_coverage.pop(best_protein)
break

if not best_protein or not peptides_covered_by_best:
break

selected_proteins.add(best_protein)
peptides_to_cover -= peptides_covered_by_best
pbar.update(len(peptides_covered_by_best))

# update other proteins' coverage
for protein in list(protein_coverage.keys()):
if protein_coverage[protein] & peptides_covered_by_best:
protein_coverage[protein] -= peptides_covered_by_best
heapq.heappush(protein_heap, (-len(protein_coverage[protein]), protein))
if not protein_coverage[protein]:
del protein_coverage[protein]
else:
raise ValueError(f"Invalid greedy method: {method}. Must be 'greedy' or 'heap'")

return selected_proteins
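
# Illustrative example (comment only): with peptides {'p1', 'p2', 'p3'} and
# protein_to_peptides = {'A': {'p1', 'p2'}, 'B': {'p2', 'p3'}, 'C': {'p3'}},
# both the 'greedy' and 'heap' variants select {'A', 'B'} for this ordering:
# 'A' covers 'p1' and 'p2' first, then 'B' covers the remaining 'p3'.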

def _create_pep_to_protein_razor(self) -> dict:
"""
Create a dictionary mapping peptides to proteins based on a minimum protein set.
Returns:
dict: A dictionary mapping peptides to proteins.
key: peptide
value: a list of proteins
"""

df = self.df.loc[:, [self.tfa.peptide_col_name, self.tfa.protein_col_name]]
# Create a dictionary mapping proteins to peptides
protein_to_peptides = defaultdict(set)
peptides = set(df[self.tfa.peptide_col_name])

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating protein to peptides mapping"):
sequence = row[self.tfa.peptide_col_name]
proteins = row[self.tfa.protein_col_name].split(';')
for protein in proteins:
protein_to_peptides[protein].add(sequence)

mini_protein_set = self.find_minimum_protein_set(peptides, protein_to_peptides)

# remove the proteins not in the mini_protein_set from the protein_to_peptides
filtered_protein_to_peptides = {protein: protein_to_peptides[protein] for protein in mini_protein_set}
# Assign each peptide to the protein that contains it with the highest peptide count
print('Assigning peptides to proteins')
peptide_to_protein = defaultdict(list)
for peptide in tqdm(peptides, desc="Assigning peptides to proteins"):
possible_proteins = [protein for protein, peps in filtered_protein_to_peptides.items() if peptide in peps]
if possible_proteins:
# Find the protein(s) containing this peptide that cover the most peptides
max_protein_count = max(len(filtered_protein_to_peptides[protein]) for protein in possible_proteins)
best_proteins = [protein for protein in possible_proteins if len(filtered_protein_to_peptides[protein]) == max_protein_count]
peptide_to_protein[peptide].extend(best_proteins)

return peptide_to_protein
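
# Illustrative example (comment only): if peptide 'p2' is contained by proteins 'A'
# and 'B' of the minimal set and both cover the same number of peptides, 'p2' maps to
# ['A', 'B'] and its intensity is later credited to both via _update_output_dict;
# otherwise it maps only to the protein covering more peptides.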

def _sum_protein_razor(self, peptide_to_protein: dict):

for sample in tqdm(self.tfa.sample_list):
print(f'Assigning protein intensity for [{sample}]')
df = self.df.loc[:,[ self.tfa.peptide_col_name, sample]]
# create a dict to store the intensity of each peptide
df.set_index(self.tfa.peptide_col_name, inplace=True)
peptide_intensity_dict = df.to_dict()[sample]
for peptide, proteins in peptide_to_protein.items():
intensity = peptide_intensity_dict[peptide]
self._update_output_dict(proteins, sample, intensity)



def _init_dicts(self):
for sample in self.tfa.sample_list:
self.res_intensity_dict[sample] = {}
@@ -147,7 +269,7 @@ def _update_output_dict(self, protein_list: list, sample_name:str, intensity:flo
self.res_intensity_dict[sample_name][protein] = intensity


def _sum_protein_razor(self, sample_name:str, by_sample=False):
def _sum_protein_rank(self, sample_name:str, by_sample=False):
# print in one line
print(f'Assigning protein intensity for [{sample_name}]', end='\r')
df = self.df.loc[:,[ self.tfa.protein_col_name, sample_name]]
@@ -180,7 +302,4 @@ def _sum_protein_anti_razor(self, sample_name:str):
for row in df.itertuples():
proteins = row[1].split(';')
intensity = row[2]
self._update_output_dict(proteins, sample_name, intensity)



self._update_output_dict(proteins, sample_name, intensity)
