This repository has been archived by the owner on Nov 5, 2024. It is now read-only.

Change input of Peptonizer to (peptide, confidence_score) maps #5

Draft · wants to merge 7 commits into main
4 changes: 3 additions & 1 deletion peptonizer/peptonizer/__init__.py
@@ -1,7 +1,9 @@
from .zero_lookahead_belief_propagation import run_belief_propagation, ZeroLookaheadProgressListener
from .plot_results import plot_peptonizer_results
from .parsers import parse_pout, parse_ms2rescore_output
from .parsers import parse_peptide_tsv
from .unipept_get_taxonomy_from_pout import fetch_unipept_taxon_information
from .weight_taxa import perform_taxa_weighing
from .factor_graph_generation import generate_pepgm_graph
from .extract_taxon_scores import extract_taxon_scores
from .analyse_grid_search import find_best_parameters, ParameterSet
from .taxa_clustering import cluster_taxa_based_on_similarity
58 changes: 58 additions & 0 deletions peptonizer/peptonizer/analyse_grid_search.py
@@ -0,0 +1,58 @@
import pandas as pd
import rbo

from scipy.stats import entropy
from typing import List, NamedTuple, Tuple


class ParameterSet(NamedTuple):
"""
Represents a set of parameters that have been used for a Peptonizer analysis to tweak the behaviour of the belief
propagation algorithm.
"""
alpha: float
beta: float
prior: float


def compute_goodness(df: pd.DataFrame, taxid_weights: pd.DataFrame):
tax_ids = df.loc[df['type'] == 'taxon'].copy()
tax_ids.loc[:, 'score'] = pd.to_numeric(tax_ids['score'], downcast = 'float')
tax_ids.sort_values('score', ascending = False, inplace = True)
tax_ids.dropna(inplace = True)

# Compute entropy of the posterior probability distribution.
computed_entropy = entropy(tax_ids['score'].tolist())

# Compute the rank-based similarity between the weight-sorted taxa and the score-sorted ID results.

return rbo.RankingSimilarity(
taxid_weights['HigherTaxa'].values,
[int(i) for i in tax_ids['ID'].values]
).rbo() * (1 / computed_entropy ** 2)


def find_best_parameters(results: List[Tuple[pd.DataFrame, ParameterSet]], taxid_weights: pd.DataFrame):
"""
Given the dataframes that have been run through the Belief Propagation Algorithm before and the matching parameter
sets, compute a goodness metric for each of these dataframes and return the ParameterSet that resulted in the
highest goodness value.

:param results: A list of tuples each holding two things:
1. A dataframe containing taxa and their associated scores after running the belief propagation algorithm
2. The parameter values that were used during the belief propagation algorithm for this set of taxa
:param taxid_weights: A dataframe containing taxa and their corresponding 'scaled weights', as computed by the
weight_taxa step.
"""
params = []
goodness_list = []

for df, param_set in results:
goodness_list.append(compute_goodness(df, taxid_weights))
params.append(param_set)

metrics_params = zip(goodness_list, params)
sorted_metric_param_pairs = sorted(metrics_params, reverse = True)

# Return the ParameterSet that's associated with the highest computed goodness metric.
return sorted_metric_param_pairs[0][1]
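
The goodness metric above multiplies the rank-biased overlap (via the `rbo` package, as used in `compute_goodness`) between the weight-sorted taxa and the posterior-sorted taxa by the inverse squared entropy of the posterior scores. A minimal illustration with hypothetical toy data (not part of the diff) of how the two ingredients combine:

```python
# Illustrative sketch only: hypothetical taxa and posterior scores showing how
# the goodness metric defined in analyse_grid_search.py behaves.
import rbo
from scipy.stats import entropy

# Hypothetical weight-sorted taxa (from the weight_taxa step) and posterior
# scores produced by belief propagation for the same taxa.
weight_sorted_taxa = [1280, 562, 4932, 9606]
posterior = {1280: 0.70, 562: 0.20, 4932: 0.07, 9606: 0.03}

# Taxa sorted by descending posterior score, as compute_goodness does.
score_sorted_taxa = sorted(posterior, key=posterior.get, reverse=True)

ranking_similarity = rbo.RankingSimilarity(weight_sorted_taxa, score_sorted_taxa).rbo()
posterior_entropy = entropy(list(posterior.values()))

# A peaked (low-entropy) posterior that agrees with the prior taxon ranking
# yields a high goodness value; a flat or discordant posterior lowers it.
goodness = ranking_similarity * (1 / posterior_entropy ** 2)
print(goodness)
```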
27 changes: 4 additions & 23 deletions peptonizer/peptonizer/factor_graph_generation.py
@@ -27,8 +27,8 @@ def create_from_taxa_weights(self, taxa_weights):
lambda row: (
row["sequence"],
{
"InitialBelief_0": float(row["score"]),
"InitialBelief_1": 1 - float(row["score"]),
"InitialBelief_0": 1 - float(row["score"]),
"InitialBelief_1": float(row["score"]),
"category": "peptide",
},
),
@@ -42,14 +42,7 @@ def create_from_taxa_weights(self, taxa_weights):
intermediate_graph.add_nodes_from(peptide_attributes)
intermediate_graph.add_nodes_from(taxa_attributes)

# cluster the resulting graph with the louvain algorithm
communities = nx.community.louvain_communities(intermediate_graph, seed=12345)

# separate the graph into its communities and enter into same graph object
for i, community in enumerate(communities):
subgraph = intermediate_graph.subgraph(community)
self.add_edges_from(subgraph.edges)

self.add_edges_from(intermediate_graph.edges)
self.add_nodes_from(peptide_attributes)
self.add_nodes_from(taxa_attributes)

@@ -260,23 +253,11 @@ def fill_in_priors(self, prior):
{node[0]: {"InitialBelief_0": 1 - prior, "InitialBelief_1": prior}},
)


def generate_ct_factor_graphs(list_of_factor_graphs, graph_type="Taxons"):
if type(list_of_factor_graphs) is not list:
list_of_factor_graphs = [list_of_factor_graphs]
for graph in list_of_factor_graphs:
ct_factor_graph = CTFactorGraph(
graph, graph_type
) # ListOfCTFactorGraphs.append(CTFactorGraph(Graph,GraphType))
# TODO Tanja: shouldn't this return all factor graphs???
return ct_factor_graph


def generate_pepgm_graph(
taxa_weights_data_frame: pd.DataFrame
) -> CTFactorGraph:
taxon_graph = TaxonGraph()
taxon_graph.create_from_taxa_weights(taxa_weights_data_frame)
factor_graph = FactorGraph()
factor_graph.construct_from_existing_graph(taxon_graph)
return generate_ct_factor_graphs(factor_graph)
return CTFactorGraph(factor_graph, "Taxons")
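
For clarity, the swapped belief assignment in this file can be read as follows. This is an illustrative sketch with a hypothetical input row, not part of the diff:

```python
# Illustration only: the corrected prior-belief assignment for a single peptide
# node, assuming 'score' is the peptide's confidence value from the new
# (peptide, confidence_score) input.
row = {"sequence": "LVNELTEFAK", "score": 0.95}  # hypothetical input row

peptide_node = (
    row["sequence"],
    {
        # InitialBelief_1 now carries the confidence that the peptide is present,
        # matching the convention used by fill_in_priors (index 1 holds the prior).
        "InitialBelief_0": 1 - float(row["score"]),
        "InitialBelief_1": float(row["score"]),
        "category": "peptide",
    },
)
```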
128 changes: 23 additions & 105 deletions peptonizer/peptonizer/parsers.py
@@ -1,114 +1,32 @@
import re
from typing import Dict, Tuple

from typing import Dict, Tuple, List


def parse_pout(pout_file: str, fdr_threshold: float) -> Dict[str, Dict[str, float | int]]:
def parse_peptide_tsv(tsv_content: str) -> Tuple[Dict[str, float], Dict[str, int]]:
"""
Parses the ms2rescore pout file for peptides, psm numbers and peptide scores.

Note: this code was adapted from the Pout2Prot tool.
Parses a TSV string with columns 'peptide' and 'score', where peptides can occur multiple times.

:param pout_file: str, content of pout files that needs to be parsed. Note: these should not be paths to pout
files, but the contents of these files already!
:param fdr_threshold: float, FDR threshold below which psms are kept
:return: dict, peptides:[score,#psms]
"""

pep_score = dict()
pep_psm = dict()
pep_score_psm = dict()
:param tsv_content: str, content of a TSV file with 'peptide' and 'score' columns.
:return: Tuple containing two dictionaries:
- pep_score: Maps peptide sequence to score (float).
- pep_psm_counts: Maps peptide sequence to the count of occurrences.
"""
pep_score: Dict[str, float] = {}
pep_psm_counts: Dict[str, int] = {}

# Skip header, so start from idx 1
for line in pout_file.splitlines()[1:]:
line = line.rstrip()
# skip empty lines
if line == "":
# Process each line in the TSV content, skipping the header
for line in tsv_content.splitlines()[1:]:
if line.strip() == "": # Skip empty lines
continue
splitted_line = line.split("\t", maxsplit=5)
assert (
len(splitted_line) >= 6
), "Input file is wrongly formatted. Make sure that the input is a valid .pout file."
psm_id, _, q, pep, peptide, _ = splitted_line
if float(q) < fdr_threshold:
peptide = re.sub("\[.*?\]", "", peptide)
peptide = peptide.split(".")[1]
# update pep_psm
if peptide not in pep_psm.keys():
pep_psm[peptide] = set()
pep_psm[peptide].add(psm_id)
else:
pep_psm[peptide].add(psm_id)
# update pep_score
if peptide not in pep_score.keys():
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = (
pep # adjustment necessary to not have 0 and 1 fuck up probability calculations
)
else:
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = min(pep, pep_score[peptide])

pep_score_psm[peptide] = {
"score": pep_score[peptide],
"psms": len(pep_psm[peptide])
}

return pep_score_psm
peptide, score = line.split("\t")
score = float(score)

# Update pep_score dictionary
pep_score[peptide] = score # Assumes the latest score in the file should be used

def parse_ms2rescore_output(pout_file: str, fdr_threshold: float) -> Dict[str, Dict[str, float | int]]:
"""
Parses the ms2rescore pout file for peptides, psm numbers and peptide scores
:param pout_file: str, content of pout files that need to be parsed.
:param fdr_threshold: float, fdr threshold below which psms are kept
:return: dict, peptides:[score,#psms]
"""

pep_score = dict()
pep_psm = dict()
pep_score_psm = dict()

# Skip header, so start from idx 1
for line in pout_file.splitlines()[1:]:
line = line.rstrip()
# skip empty lines
if line == "":
continue
splitted_line = line.split("\t")[0:8]
# assert len(splitted_line) >= 6, "Input file is wrongly formatted. Make sure that the input is a valid .pout file."
peptide, psm_id, run, collection, is_decoy, score, q, pep = (
splitted_line
)
if float(q) < fdr_threshold:
peptide = re.sub("\[.*?\]", "", peptide)
peptide = peptide.split("/")[0]
# update pep_psm
if peptide not in pep_psm.keys():
pep_psm[peptide] = set()
pep_psm[peptide].add(psm_id)
else:
pep_psm[peptide].add(psm_id)
# update pep_score
if peptide not in pep_score.keys():
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = (
pep # adjustment necessary to not have 0 and 1 fuck up probability calculations
)
else:
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = min(pep, pep_score[peptide])
pep_score_psm[peptide] = {
"score": pep_score[peptide],
"psms": len(pep_psm[peptide])
}
# Update pep_psm_counts dictionary
if peptide in pep_psm_counts:
pep_psm_counts[peptide] += 1
else:
pep_psm_counts[peptide] = 1

return pep_score_psm
return pep_score, pep_psm_counts
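
The new parser expects a two-column TSV with a header row. A usage sketch with hypothetical input (not part of the diff), assuming the package is importable as `peptonizer`:

```python
# Usage sketch with hypothetical input for the new parse_peptide_tsv parser.
from peptonizer import parse_peptide_tsv

tsv_content = (
    "peptide\tscore\n"
    "LVNELTEFAK\t0.95\n"
    "AGLQFPVGR\t0.80\n"
    "LVNELTEFAK\t0.90\n"  # repeated peptide: counted as an extra PSM, latest score kept
)

pep_score, pep_psm_counts = parse_peptide_tsv(tsv_content)
# pep_score       -> {"LVNELTEFAK": 0.90, "AGLQFPVGR": 0.80}
# pep_psm_counts  -> {"LVNELTEFAK": 2, "AGLQFPVGR": 1}
```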
7 changes: 5 additions & 2 deletions peptonizer/peptonizer/request_manager.py
@@ -1,13 +1,16 @@
import requests
import sys


class RequestManager:
@staticmethod
def perform_post_request(url, payload):
req = requests.Request('POST', url=url, json=payload)
prepared = req.prepare()
# This header needs to be removed since most browsers do not allow us to set it manually
del prepared.headers["Content-length"]
# This header needs to be removed since most browsers do not allow us to set it manually; this is only
# required when executing the Peptonizer with Pyodide.
if sys.platform == 'emscripten':
del prepared.headers["Content-length"]
# Perform the HTTP POST request
s = requests.Session()
return s.send(prepared)
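
The call site itself does not change; the only behavioural difference is that the Content-length header is now stripped solely when `sys.platform == 'emscripten'`, i.e. when the Peptonizer runs under Pyodide. A minimal usage sketch with a placeholder URL and payload (not part of the diff):

```python
# Hypothetical usage of RequestManager.perform_post_request; the URL and payload
# are placeholders, not real Peptonizer endpoints.
response = RequestManager.perform_post_request(
    "https://example.org/api/endpoint",
    {"peptides": ["LVNELTEFAK", "AGLQFPVGR"]},
)
print(response.status_code)
```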
73 changes: 73 additions & 0 deletions peptonizer/peptonizer/taxa_clustering.py
@@ -0,0 +1,73 @@
import networkx as nx
import pandas as pd


def get_peptides_per_taxon(graph_in: nx.Graph):
peptidome_dict = {}
for node in graph_in.nodes(data = True):
if node[1]['category'] == 'taxon' and node[0]:
neighbors = graph_in.neighbors(node[0])
peptidome_dict.update({node[0]: [n[:-4] for n in neighbors]})
return peptidome_dict


def compute_detected_peptidome_similarity(peptidome_dict):
sim_matrix_max = []
taxa1 = []
taxa2 = []
for taxon1 in peptidome_dict.keys():
taxa1.append(taxon1)
sim_matrix_max_row = []
for taxon2 in peptidome_dict.keys():
taxa2.append(taxon2)
peptides1 = set(peptidome_dict[taxon1])
peptides2 = set(peptidome_dict[taxon2])
shared = len(peptides1.intersection(peptides2))
try:
sim = shared / ( len(peptides2))
except:
sim = 0

sim_matrix_max_row.append(sim)

sim_matrix_max.append(sim_matrix_max_row)

similarity_frame = pd.DataFrame(sim_matrix_max, columns = taxa1, index = taxa1)
return similarity_frame


def cluster_taxa_based_on_similarity(
pepgm_graph: nx.Graph,
taxid_weights: pd.DataFrame,
similarity_threshold: float
):
peptidome_dict = get_peptides_per_taxon(pepgm_graph)
similarities = compute_detected_peptidome_similarity(peptidome_dict)

taxid_weights = taxid_weights.loc[taxid_weights.HigherTaxa.isin([int(x) for x in similarities.index.tolist()])]

list_of_weight_sorted_taxa = taxid_weights.HigherTaxa.tolist()
taxa_cluster_list = []

cluster_heads = [taxid_weights.HigherTaxa[0]]

while list_of_weight_sorted_taxa:
taxon1 = list_of_weight_sorted_taxa[0]
cluster_list = []
cluster_heads.append(list_of_weight_sorted_taxa[0])

for taxon2 in taxid_weights.HigherTaxa:
if similarities[str(int(taxon2))][str(int(taxon1))] > similarity_threshold:
cluster_list.append(taxon2)
if taxon2 in list_of_weight_sorted_taxa:
list_of_weight_sorted_taxa.remove(taxon2)

taxa_cluster_list.append(cluster_list)


clustered_weight_sorted_taxa = taxid_weights.loc[taxid_weights.HigherTaxa.isin(cluster_heads)]
clustered_weight_sorted_taxa2 = taxid_weights.loc[taxid_weights.HigherTaxa.isin([clustered_taxa[1:] for clustered_taxa in taxa_cluster_list])]
clustered_weight_sorted_taxa = pd.concat([clustered_weight_sorted_taxa, clustered_weight_sorted_taxa2])
clustered_weight_sorted_taxa['Clustermembers'] = taxa_cluster_list

return clustered_weight_sorted_taxa
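
The clustering greedily walks through the weight-sorted taxa and merges every taxon whose detected-peptidome similarity to the current cluster head exceeds the threshold. To make that similarity measure concrete, here is a small illustration with hypothetical peptidomes (not part of the diff); the import path is an assumption based on the package layout:

```python
# Illustration with hypothetical detected peptidomes: the similarity of taxon2 to
# taxon1 is the fraction of taxon2's detected peptides that taxon1 also explains.
from peptonizer.taxa_clustering import compute_detected_peptidome_similarity

peptidome_dict = {
    "1280": ["PEPTIDEA", "PEPTIDEB", "PEPTIDEC"],  # hypothetical taxon IDs and peptides
    "1279": ["PEPTIDEA", "PEPTIDEB"],
    "562":  ["PEPTIDED"],
}

similarities = compute_detected_peptidome_similarity(peptidome_dict)

# All detected peptides of taxon 1279 are shared with taxon 1280, so with a
# similarity_threshold below 1.0 the clustering would merge 1279 into 1280's cluster.
print(similarities["1279"]["1280"])  # 1.0
print(similarities["562"]["1280"])   # 0.0
```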