Skip to content

Commit

Permalink
1.Use the heap as the default data structure to apply razor method to…
Browse files Browse the repository at this point in the history
… sum peptide intensity to protein intensity.

2. Changed the update notification to a dialog window, because the update message can be too long to display in a message box.
  • Loading branch information
byemaxx committed Jun 26, 2024
1 parent a2c1b7c commit 9597752
Show file tree
Hide file tree
Showing 10 changed files with 642 additions and 465 deletions.
7 changes: 7 additions & 0 deletions Docs/ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Version: 1.107.8
## Date: 2024-06-26
### Changes:
- Change:
- 1. Use a heap as the default data structure when applying the razor method to sum peptide intensities into protein intensities.
- 2. Changed the update notification to a dialog window, because the update message can be too long to display in a message box.

# Version: 1.107.7
## Date: 2024-06-24
### Changes:
Expand Down
2 changes: 1 addition & 1 deletion Docs/MetaX_Cookbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ Click **Create Proteins Intensity Table** to sum peptides to proteins if the Pro

- **Occam's Razor**, **Anti-Razor** and **Rank:** Methods available for inferring shared peptides.
- Razor:
1. Build a minimal set of proteins to cover all peptides (Set Cover Problem).
1. Build a minimal set of proteins to cover all peptides.
2. For each peptide, choose the protein that has the most peptides (if multiple proteins have the same number of peptides, the intensity is shared among them).
- Anti-Razor:
- The intensity of each peptide is shared among all the proteins it is assigned to.
Expand Down
86 changes: 62 additions & 24 deletions utils/AnalyzerUtils/SumProteinIntensity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,24 @@
class SumProteinIntensity:
def __init__(self, taxa_func_analyzer):
self.tfa = taxa_func_analyzer
self.res_intensity_dict = {} #store all sample to output
self.rank_dict = {} #store the rank of protein intensity for each sample temporarily
self.rank_method = None # only used for rank method
self.res_intensity_dict = {} # store all sample to output
self.rank_dict = {} # store the rank of protein intensity for each sample temporarily
self.rank_method = None # only used for rank method
self.extract_col_name = [self.tfa.peptide_col_name, self.tfa.protein_col_name] + self.tfa.sample_list
self.df = self.tfa.original_df.loc[:,self.extract_col_name]
self.df = self.tfa.original_df.loc[:, self.extract_col_name]
self._init_dicts()
self.greedy_method = None # only used for razor method


def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts'):
def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='unique_counts', greedy_method='heap'):

if method not in ['razor', 'anti-razor', 'rank']:
raise ValueError('Method must in ["razor", "anti-razor", "rank"]')
if rank_method not in ['shared_intensity', 'all_counts', 'unique_counts', 'unique_intensity']:
raise ValueError('Rank method must in ["shared_intensity", "all_counts", "unique_counts", "unique_intensity"]')

self.rank_method = rank_method
self.greedy_method = greedy_method

if method == 'rank':
print(f"\n-------------Start to sum protein intensity using method: [{method}] by_sample: [{by_sample}] rank_method: [{rank_method}]-------------")
Expand Down Expand Up @@ -84,27 +86,20 @@ def sum_protein_intensity(self, method='razor', by_sample=False, rank_method='un

return res_df


def _create_pep_to_protein_razor(self) -> dict:
"""
Create a dictionary mapping peptides to proteins based on a minimum protein set.
Returns:
dict: A dictionary mapping peptides to proteins.
key: peptide
value: a list of proteins
"""
# crate a function to find the minimum protein set
def find_minimum_protein_set(peptides, protein_to_peptides):
print('Start to create protein dict using "Set Cover Problem"')
peptides_to_cover = set(peptides)
selected_proteins = set()

# razor method
def find_minimum_protein_set(self, peptides, protein_to_peptides):
protein_to_peptides_copy = protein_to_peptides.copy()
peptides_to_cover = set(peptides)
selected_proteins = set()
method = self.greedy_method

if method == 'greedy':
print('Start creating protein dict for "Set Cover Problem" with Greedy Approximation Algorithm')
with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar:
while peptides_to_cover:
best_protein = None
peptides_covered_by_best = set()
for protein, covered_peptides in protein_to_peptides.items():
for protein, covered_peptides in protein_to_peptides_copy.items():
covered = peptides_to_cover & covered_peptides
if len(covered) > len(peptides_covered_by_best):
best_protein = protein
Expand All @@ -115,9 +110,52 @@ def find_minimum_protein_set(peptides, protein_to_peptides):

selected_proteins.add(best_protein)
peptides_to_cover -= peptides_covered_by_best
protein_to_peptides_copy.pop(best_protein) # remove the protein from the dict to speed up the process
pbar.update(len(peptides_covered_by_best))
elif method == 'heap':
import heapq
print('Start creating protein dict for "Set Cover Problem" with Heap Optimization of Greedy Approximation Algorithm')
protein_coverage = {protein: covered_peptides & peptides_to_cover
for protein, covered_peptides in protein_to_peptides_copy.items()}
protein_heap = [(-len(covered), protein) for protein, covered in protein_coverage.items()]
heapq.heapify(protein_heap)

with tqdm(total=len(peptides_to_cover), desc="Covering peptides") as pbar:
while peptides_to_cover:
while protein_heap:
max_covered, best_protein = heapq.heappop(protein_heap)
if best_protein in protein_coverage:
peptides_covered_by_best = protein_coverage.pop(best_protein)
break

if not best_protein or not peptides_covered_by_best:
break

return selected_proteins
selected_proteins.add(best_protein)
peptides_to_cover -= peptides_covered_by_best
pbar.update(len(peptides_covered_by_best))

# update other proteins' coverage
for protein in list(protein_coverage.keys()):
if protein_coverage[protein] & peptides_covered_by_best:
protein_coverage[protein] -= peptides_covered_by_best
heapq.heappush(protein_heap, (-len(protein_coverage[protein]), protein))
if not protein_coverage[protein]:
del protein_coverage[protein]
else:
raise ValueError(f"Invalid greedy method: {method}. Must be ['greedy' or 'heap']")

return selected_proteins

def _create_pep_to_protein_razor(self) -> dict:
"""
Create a dictionary mapping peptides to proteins based on a minimum protein set.
Returns:
dict: A dictionary mapping peptides to proteins.
key: peptide
value: a list of proteins
"""

df = self.df.loc[:, [self.tfa.peptide_col_name, self.tfa.protein_col_name]]
# Create a dictionary mapping proteins to peptides
Expand All @@ -130,7 +168,7 @@ def find_minimum_protein_set(peptides, protein_to_peptides):
for protein in proteins:
protein_to_peptides[protein].add(sequence)

mini_protein_set = find_minimum_protein_set(peptides, protein_to_peptides)
mini_protein_set = self.find_minimum_protein_set(peptides, protein_to_peptides)

# remove the proteins not in the mini_protein_set from the protein_to_peptides
filtered_protein_to_peptides = {protein: protein_to_peptides[protein] for protein in mini_protein_set}
Expand Down
20 changes: 17 additions & 3 deletions utils/GUI.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,13 +650,21 @@ def show_settings_window(self):
self.settings_dialog.setModal(False)
layout = QVBoxLayout(self.settings_dialog)
self.settings_dialog.resize(900, 600)

settings_widget = SettingsWidget(self.settings_dialog, self.update_branch, self.auto_check_update)
# General settings
settings_widget = SettingsWidget(
parent=self.settings_dialog,
update_branch=self.update_branch,
auto_check_update=self.auto_check_update,
QSettings=self.settings,
)
settings_widget.update_mode_changed.connect(self.on_update_mode_changed)
settings_widget.auto_check_update_changed.connect(self.on_auto_check_update_changed)
# plotting parameters
settings_widget.heatmap_params_dict_changed.connect(self.on_heatmap_params_changed)
settings_widget.tf_link_net_params_dict_changed.connect(self.on_tf_link_net_params_changed)
settings_widget.html_theme_changed.connect(self.on_html_theme_changed)
# Other settings
settings_widget.protein_infer_method_changed.connect(self.on_protein_infer_method_changed)

layout.addWidget(settings_widget)
self.settings_dialog.setLayout(layout)
Expand Down Expand Up @@ -686,6 +694,11 @@ def on_tf_link_net_params_changed(self, params_dict):
def on_html_theme_changed(self, theme):
self.html_theme = theme
print(f"HTML theme changed to: {theme}")

def on_protein_infer_method_changed(self, method):
#save to settings
self.settings.setValue("protein_infer_greedy_mode", method)
print(f"Protein infering razor mode changed to: {method}")

############### basic function End ###############

Expand Down Expand Up @@ -2478,7 +2491,8 @@ def set_multi_table(self, restore_taxafunc=False, saved_obj=None):
sum_protein_params = {
'method': self.comboBox_method_of_protein_inference.currentText(),
'by_sample': self.checkBox_infrence_protein_by_sample.isChecked(),
'rank_method' :self.comboBox_protein_ranking_method.currentText()
'rank_method' :self.comboBox_protein_ranking_method.currentText(),
'greedy_method': self.settings.value('protein_infer_greedy_mode', 'heap')
}


Expand Down
Loading

0 comments on commit 9597752

Please sign in to comment.