This repository has been archived by the owner on Nov 5, 2024. It is now read-only.

Change input of Peptonizer to (peptide, confidence_score) maps #5

Draft · wants to merge 7 commits into main
4 changes: 3 additions & 1 deletion peptonizer/peptonizer/__init__.py
@@ -1,7 +1,9 @@
from .zero_lookahead_belief_propagation import run_belief_propagation, ZeroLookaheadProgressListener
from .plot_results import plot_peptonizer_results
from .parsers import parse_pout, parse_ms2rescore_output
from .parsers import parse_peptide_tsv
from .unipept_get_taxonomy_from_pout import fetch_unipept_taxon_information
from .weight_taxa import perform_taxa_weighing
from .factor_graph_generation import generate_pepgm_graph
from .extract_taxon_scores import extract_taxon_scores
from .analyse_grid_search import find_best_parameters, ParameterSet
from .taxa_clustering import cluster_taxa_based_on_similarity
58 changes: 58 additions & 0 deletions peptonizer/peptonizer/analyse_grid_search.py
@@ -0,0 +1,58 @@
import pandas as pd
import rbo

from scipy.stats import entropy
from typing import List, NamedTuple, Tuple


class ParameterSet(NamedTuple):
"""
Represents a set of parameters that have been used for a Peptonizer analysis to tweak the behaviour of the belief
propagation algorithm.
"""
alpha: float
beta: float
prior: float


def compute_goodness(df: pd.DataFrame, taxid_weights: pd.DataFrame):
tax_ids = df.loc[df['type'] == 'taxon'].copy()
tax_ids.loc[:, 'score'] = pd.to_numeric(tax_ids['score'], downcast = 'float')
tax_ids.sort_values('score', ascending = False, inplace = True)
tax_ids.dropna(inplace = True)

# Compute entropy of the posterior probability distribution.
computed_entropy = entropy(tax_ids['score'].tolist())

# Compute the rank-based similarity between the weight-sorted taxa and the score-sorted ID results.

return rbo.RankingSimilarity(
taxid_weights['HigherTaxa'].values,
[int(i) for i in tax_ids['ID'].values]
).rbo() * (1 / computed_entropy ** 2)


def find_best_parameters(results: List[Tuple[pd.DataFrame, ParameterSet]], taxid_weights: pd.DataFrame):
"""
Given the dataframes that have been run through the Belief Propagation Algorithm before and the matching parameter
sets, compute a goodness metric for each of these dataframes and return the ParameterSet that resulted in the
highest goodness value.

:param results: A list of tuples each holding two things:
1. A dataframe containing taxa and their associated scores after running the belief propagation algorithm
2. The parameter values that were used during the belief propagation algorithm for this set of taxa
:param taxid_weights: A dataframe containing taxa and their corresponding 'scaled weights', as computed by the
weight_taxa step.
"""
params = []
goodness_list = []

for df, param_set in results:
goodness_list.append(compute_goodness(df, taxid_weights))
params.append(param_set)

metrics_params = zip(goodness_list, params)
sorted_metric_param_pairs = sorted(metrics_params, reverse = True)

# Return the ParameterSet that's associated with the highest computed goodness metric.
return sorted_metric_param_pairs[0][1]
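
The goodness metric above multiplies the rank-biased overlap (via the `rbo` package, as used in `compute_goodness`) between the weight-sorted taxa and the posterior-sorted taxa by the inverse squared entropy of the posterior scores. A minimal illustration with hypothetical toy data (not part of the diff) of how the two ingredients combine:

```python
# Illustrative sketch only: hypothetical taxa and posterior scores showing how
# the goodness metric defined in analyse_grid_search.py behaves.
import rbo
from scipy.stats import entropy

# Hypothetical weight-sorted taxa (from the weight_taxa step) and posterior
# scores produced by belief propagation for the same taxa.
weight_sorted_taxa = [1280, 562, 4932, 9606]
posterior = {1280: 0.70, 562: 0.20, 4932: 0.07, 9606: 0.03}

# Taxa sorted by descending posterior score, as compute_goodness does.
score_sorted_taxa = sorted(posterior, key=posterior.get, reverse=True)

ranking_similarity = rbo.RankingSimilarity(weight_sorted_taxa, score_sorted_taxa).rbo()
posterior_entropy = entropy(list(posterior.values()))

# A peaked (low-entropy) posterior that agrees with the prior taxon ranking
# yields a high goodness value; a flat or discordant posterior lowers it.
goodness = ranking_similarity * (1 / posterior_entropy ** 2)
print(goodness)
```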
27 changes: 4 additions & 23 deletions peptonizer/peptonizer/factor_graph_generation.py
@@ -27,8 +27,8 @@ def create_from_taxa_weights(self, taxa_weights):
lambda row: (
row["sequence"],
{
"InitialBelief_0": float(row["score"]),
"InitialBelief_1": 1 - float(row["score"]),
"InitialBelief_0": 1 - float(row["score"]),
"InitialBelief_1": float(row["score"]),
"category": "peptide",
},
),
@@ -42,14 +42,7 @@ def create_from_taxa_weights(self, taxa_weights):
intermediate_graph.add_nodes_from(peptide_attributes)
intermediate_graph.add_nodes_from(taxa_attributes)

# cluster the resulting graph with the louvain algorithm
communities = nx.community.louvain_communities(intermediate_graph, seed=12345)

# separate the graph into its communities and enter into same graph object
for i, community in enumerate(communities):
subgraph = intermediate_graph.subgraph(community)
self.add_edges_from(subgraph.edges)

self.add_edges_from(intermediate_graph.edges)
self.add_nodes_from(peptide_attributes)
self.add_nodes_from(taxa_attributes)

@@ -260,23 +253,11 @@ def fill_in_priors(self, prior):
{node[0]: {"InitialBelief_0": 1 - prior, "InitialBelief_1": prior}},
)


def generate_ct_factor_graphs(list_of_factor_graphs, graph_type="Taxons"):
if type(list_of_factor_graphs) is not list:
list_of_factor_graphs = [list_of_factor_graphs]
for graph in list_of_factor_graphs:
ct_factor_graph = CTFactorGraph(
graph, graph_type
) # ListOfCTFactorGraphs.append(CTFactorGraph(Graph,GraphType))
# TODO Tanja: shouldn't this return all factor graphs???
return ct_factor_graph


def generate_pepgm_graph(
taxa_weights_data_frame: pd.DataFrame
) -> CTFactorGraph:
taxon_graph = TaxonGraph()
taxon_graph.create_from_taxa_weights(taxa_weights_data_frame)
factor_graph = FactorGraph()
factor_graph.construct_from_existing_graph(taxon_graph)
return generate_ct_factor_graphs(factor_graph)
return CTFactorGraph(factor_graph, "Taxons")
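
For clarity, the swapped belief assignment in this file can be read as follows. This is an illustrative sketch with a hypothetical input row, not part of the diff:

```python
# Illustration only: the corrected prior-belief assignment for a single peptide
# node, assuming 'score' is the peptide's confidence value from the new
# (peptide, confidence_score) input.
row = {"sequence": "LVNELTEFAK", "score": 0.95}  # hypothetical input row

peptide_node = (
    row["sequence"],
    {
        # InitialBelief_1 now carries the confidence that the peptide is present,
        # matching the convention used by fill_in_priors (index 1 holds the prior).
        "InitialBelief_0": 1 - float(row["score"]),
        "InitialBelief_1": float(row["score"]),
        "category": "peptide",
    },
)
```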
128 changes: 23 additions & 105 deletions peptonizer/peptonizer/parsers.py
@@ -1,114 +1,32 @@
import re
from typing import Dict, Tuple

from typing import Dict, Tuple, List


def parse_pout(pout_file: str, fdr_threshold: float) -> Dict[str, Dict[str, float | int]]:
def parse_peptide_tsv(tsv_content: str) -> Tuple[Dict[str, float], Dict[str, int]]:
"""
Parses the ms2rescore pout file for peptides, psm numbers and peptide scores.

Note: this code was adapted from the Pout2Prot tool.
Parses a TSV string with columns 'peptide' and 'score', where peptides can occur multiple times.

:param pout_file: str, content of pout files that needs to be parsed. Note: these should not be paths to pout
files, but the contents of these files already!
:param fdr_threshold: float, FDR threshold below which psms are kept
:return: dict, peptides:[score,#psms]
"""

pep_score = dict()
pep_psm = dict()
pep_score_psm = dict()
:param tsv_content: str, content of a TSV file with 'peptide' and 'score' columns.
:return: Tuple containing two dictionaries:
- pep_score: Maps peptide sequence to score (float).
- pep_psm_counts: Maps peptide sequence to the count of occurrences.
"""
pep_score: Dict[str, float] = {}
pep_psm_counts: Dict[str, int] = {}

# Skip header, so start from idx 1
for line in pout_file.splitlines()[1:]:
line = line.rstrip()
# skip empty lines
if line == "":
# Process each line in the TSV content, skipping the header
for line in tsv_content.splitlines()[1:]:
if line.strip() == "": # Skip empty lines
continue
splitted_line = line.split("\t", maxsplit=5)
assert (
len(splitted_line) >= 6
), "Input file is wrongly formatted. Make sure that the input is a valid .pout file."
psm_id, _, q, pep, peptide, _ = splitted_line
if float(q) < fdr_threshold:
peptide = re.sub("\[.*?\]", "", peptide)
peptide = peptide.split(".")[1]
# update pep_psm
if peptide not in pep_psm.keys():
pep_psm[peptide] = set()
pep_psm[peptide].add(psm_id)
else:
pep_psm[peptide].add(psm_id)
# update pep_score
if peptide not in pep_score.keys():
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = (
pep # adjustment necessary to not have 0 and 1 fuck up probability calculations
)
else:
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = min(pep, pep_score[peptide])

pep_score_psm[peptide] = {
"score": pep_score[peptide],
"psms": len(pep_psm[peptide])
}

return pep_score_psm
peptide, score = line.split("\t")
score = float(score)

# Update pep_score dictionary
pep_score[peptide] = score # Assumes the latest score in the file should be used

def parse_ms2rescore_output(pout_file: str, fdr_threshold: float) -> Dict[str, Dict[str, float | int]]:
"""
Parses the ms2rescore pout file for peptides, psm numbers and peptide scores
:param pout_file: str, content of pout files that need to be parsed.
:param fdr_threshold: float, fdr threshold below which psms are kept
:return: dict, peptides:[score,#psms]
"""

pep_score = dict()
pep_psm = dict()
pep_score_psm = dict()

# Skip header, so start from idx 1
for line in pout_file.splitlines()[1:]:
line = line.rstrip()
# skip empty lines
if line == "":
continue
splitted_line = line.split("\t")[0:8]
# assert len(splitted_line) >= 6, "Input file is wrongly formatted. Make sure that the input is a valid .pout file."
peptide, psm_id, run, collection, is_decoy, score, q, pep = (
splitted_line
)
if float(q) < fdr_threshold:
peptide = re.sub("\[.*?\]", "", peptide)
peptide = peptide.split("/")[0]
# update pep_psm
if peptide not in pep_psm.keys():
pep_psm[peptide] = set()
pep_psm[peptide].add(psm_id)
else:
pep_psm[peptide].add(psm_id)
# update pep_score
if peptide not in pep_score.keys():
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = (
pep # adjustment necessary to not have 0 and 1 fuck up probability calculations
)
else:
if float(pep) < 0.001:
pep_score[peptide] = "0.001"
else:
pep_score[peptide] = min(pep, pep_score[peptide])
pep_score_psm[peptide] = {
"score": pep_score[peptide],
"psms": len(pep_psm[peptide])
}
# Update pep_psm_counts dictionary
if peptide in pep_psm_counts:
pep_psm_counts[peptide] += 1
else:
pep_psm_counts[peptide] = 1

return pep_score_psm
return pep_score, pep_psm_counts
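
The new parser expects a two-column TSV with a header row. A usage sketch with hypothetical input (not part of the diff), assuming the package is importable as `peptonizer`:

```python
# Usage sketch with hypothetical input for the new parse_peptide_tsv parser.
from peptonizer import parse_peptide_tsv

tsv_content = (
    "peptide\tscore\n"
    "LVNELTEFAK\t0.95\n"
    "AGLQFPVGR\t0.80\n"
    "LVNELTEFAK\t0.90\n"  # repeated peptide: counted as an extra PSM, latest score kept
)

pep_score, pep_psm_counts = parse_peptide_tsv(tsv_content)
# pep_score       -> {"LVNELTEFAK": 0.90, "AGLQFPVGR": 0.80}
# pep_psm_counts  -> {"LVNELTEFAK": 2, "AGLQFPVGR": 1}
```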
7 changes: 5 additions & 2 deletions peptonizer/peptonizer/request_manager.py
@@ -1,13 +1,16 @@
import requests
import sys


class RequestManager:
@staticmethod
def perform_post_request(url, payload):
req = requests.Request('POST', url=url, json=payload)
prepared = req.prepare()
# This header needs to be removed since most browsers do not allow us to set it manually
del prepared.headers["Content-length"]
# This header needs to be removed since most browsers do not allow us to set it manually; this is only
# required when executing the Peptonizer with Pyodide.
if sys.platform == 'emscripten':
del prepared.headers["Content-length"]
# Perform the HTTP POST request
s = requests.Session()
return s.send(prepared)
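
The call site itself does not change; the only behavioural difference is that the Content-length header is now stripped solely when `sys.platform == 'emscripten'`, i.e. when the Peptonizer runs under Pyodide. A minimal usage sketch with a placeholder URL and payload (not part of the diff):

```python
# Hypothetical usage of RequestManager.perform_post_request; the URL and payload
# are placeholders, not real Peptonizer endpoints.
response = RequestManager.perform_post_request(
    "https://example.org/api/endpoint",
    {"peptides": ["LVNELTEFAK", "AGLQFPVGR"]},
)
print(response.status_code)
```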
73 changes: 73 additions & 0 deletions peptonizer/peptonizer/taxa_clustering.py
@@ -0,0 +1,73 @@
import networkx as nx
import pandas as pd


def get_peptides_per_taxon(graph_in: nx.Graph):
peptidome_dict = {}
for node in graph_in.nodes(data = True):
if node[1]['category'] == 'taxon' and node[0]:
neighbors = graph_in.neighbors(node[0])
peptidome_dict.update({node[0]: [n[:-4] for n in neighbors]})
return peptidome_dict


def compute_detected_peptidome_similarity(peptidome_dict):
sim_matrix_max = []
taxa1 = []
taxa2 = []
for taxon1 in peptidome_dict.keys():
taxa1.append(taxon1)
sim_matrix_max_row = []
for taxon2 in peptidome_dict.keys():
taxa2.append(taxon2)
peptides1 = set(peptidome_dict[taxon1])
peptides2 = set(peptidome_dict[taxon2])
shared = len(peptides1.intersection(peptides2))
try:
sim = shared / ( len(peptides2))
except:
sim = 0

sim_matrix_max_row.append(sim)

sim_matrix_max.append(sim_matrix_max_row)

similarity_frame = pd.DataFrame(sim_matrix_max, columns = taxa1, index = taxa1)
return similarity_frame


def cluster_taxa_based_on_similarity(
pepgm_graph: nx.Graph,
taxid_weights: pd.DataFrame,
similarity_threshold: float
):
peptidome_dict = get_peptides_per_taxon(pepgm_graph)
similarities = compute_detected_peptidome_similarity(peptidome_dict)

taxid_weights = taxid_weights.loc[taxid_weights.HigherTaxa.isin([int(x) for x in similarities.index.tolist()])]

list_of_weight_sorted_taxa = taxid_weights.HigherTaxa.tolist()
taxa_cluster_list = []

cluster_heads = [taxid_weights.HigherTaxa[0]]

while list_of_weight_sorted_taxa:
taxon1 = list_of_weight_sorted_taxa[0]
cluster_list = []
cluster_heads.append(list_of_weight_sorted_taxa[0])

for taxon2 in taxid_weights.HigherTaxa:
if similarities[str(int(taxon2))][str(int(taxon1))] > similarity_threshold:
cluster_list.append(taxon2)
if taxon2 in list_of_weight_sorted_taxa:
list_of_weight_sorted_taxa.remove(taxon2)

taxa_cluster_list.append(cluster_list)


clustered_weight_sorted_taxa = taxid_weights.loc[taxid_weights.HigherTaxa.isin(cluster_heads)]
clustered_weight_sorted_taxa2 = taxid_weights.loc[taxid_weights.HigherTaxa.isin([clustered_taxa[1:] for clustered_taxa in taxa_cluster_list])]
clustered_weight_sorted_taxa = pd.concat([clustered_weight_sorted_taxa, clustered_weight_sorted_taxa2])
clustered_weight_sorted_taxa['Clustermembers'] = taxa_cluster_list

return clustered_weight_sorted_taxa
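
The clustering greedily walks through the weight-sorted taxa and merges every taxon whose detected-peptidome similarity to the current cluster head exceeds the threshold. To make that similarity measure concrete, here is a small illustration with hypothetical peptidomes (not part of the diff); the import path is an assumption based on the package layout:

```python
# Illustration with hypothetical detected peptidomes: the similarity of taxon2 to
# taxon1 is the fraction of taxon2's detected peptides that taxon1 also explains.
from peptonizer.taxa_clustering import compute_detected_peptidome_similarity

peptidome_dict = {
    "1280": ["PEPTIDEA", "PEPTIDEB", "PEPTIDEC"],  # hypothetical taxon IDs and peptides
    "1279": ["PEPTIDEA", "PEPTIDEB"],
    "562":  ["PEPTIDED"],
}

similarities = compute_detected_peptidome_similarity(peptidome_dict)

# All detected peptides of taxon 1279 are shared with taxon 1280, so with a
# similarity_threshold below 1.0 the clustering would merge 1279 into 1280's cluster.
print(similarities["1279"]["1280"])  # 1.0
print(similarities["562"]["1280"])   # 0.0
```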