Skip to content

Commit

Permalink
working predict_library and csv output
Browse files Browse the repository at this point in the history
  • Loading branch information
rodvrees committed May 6, 2024
1 parent 683d52d commit 4cf2fde
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 54 deletions.
15 changes: 14 additions & 1 deletion ms2pip/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,21 @@ def predict_batch(*args, **kwargs):


@cli.command(help=ms2pip.core.predict_library.__doc__)
@click.argument("proteome", required=True, type=click.Path(exists=True))
@click.option("--output-name", "-o", type=str)
@click.option("--processes", "-n", type=int)
def predict_library(*args, **kwargs):
ms2pip.core.predict_library(*args, **kwargs)
output_name = kwargs.pop("output_name")
output_name = _infer_output_name(kwargs["proteome"], output_name)
output_name_csv = output_name.with_name(output_name.stem + "_predictions").with_suffix(".csv")

predictions = ms2pip.core.predict_library(*args, **kwargs)
# Combine predictions into a single list
predictions = [pred for sublist in predictions for pred in sublist]
logger.info(f'Writing output to {output_name_csv}')
results_to_csv(predictions, output_name_csv)
logger.info(f'Finished writing output to {output_name_csv}')



@cli.command(help=ms2pip.core.correlate.__doc__)
Expand Down
6 changes: 5 additions & 1 deletion ms2pip/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ def predict_batch(
Predicted spectra with theoretical m/z and predicted intensity values.
"""
if isinstance(psms, list):
psms = PSMList(psm_list=psms)
psm_list = read_psms(psms, filetype=psm_filetype)

if add_retention_time:
Expand Down Expand Up @@ -137,7 +139,7 @@ def predict_library(
----------
proteome
ProteomeSearchSpace, or a dictionary or path to JSON file with proteome search space
paramters.
parameters.
add_retention_time
Add retention time predictions with DeepLC (Requires optional DeepLC dependency).
model
Expand Down Expand Up @@ -939,4 +941,6 @@ def _assemble_training_data(results: List[ProcessingResult], model: str) -> pd.D

def _into_batches(items: List[Any], batch_size: int) -> List[List[Any]]:
"""Divide list of items into batches for batch-based processing."""
if isinstance(items, itertools.chain):
items = list(items)
return [items[i : i + batch_size] for i in range(0, len(items), batch_size)]
4 changes: 4 additions & 0 deletions ms2pip/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import csv
from typing import Any, Dict, List, Optional, Tuple
from logging import getLogger

import numpy as np
from psm_utils import PSM
Expand All @@ -17,6 +18,7 @@

from ms2pip.spectrum import ObservedSpectrum, PredictedSpectrum

logger = getLogger(__name__)

class ProcessingResult(BaseModel):
"""Result of processing a single PSM."""
Expand Down Expand Up @@ -120,6 +122,7 @@ def results_to_csv(results: List["ProcessingResult"], output_file: str) -> None:
with open(output_file, "wt") as f:
fieldnames = [
"psm_index",
"peptidoform",
"ion_type",
"ion_number",
"mz",
Expand All @@ -135,6 +138,7 @@ def results_to_csv(results: List["ProcessingResult"], output_file: str) -> None:
writer.writerow(
{
"psm_index": result.psm_index,
"peptidoform": result.psm.peptidoform,
"ion_type": ion_type,
"ion_number": i + 1,
"mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]),
Expand Down
134 changes: 82 additions & 52 deletions ms2pip/search_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,24 @@
import multiprocessing.dummy
from collections import defaultdict
from functools import cmp_to_key, partial

Check failure on line 8 in ms2pip/search_space.py

View workflow job for this annotation

GitHub Actions / build (3.9)

Ruff (F401)

ms2pip/search_space.py:8:23: F401 `functools.cmp_to_key` imported but unused

Check failure on line 8 in ms2pip/search_space.py

View workflow job for this annotation

GitHub Actions / build (3.11)

Ruff (F401)

ms2pip/search_space.py:8:23: F401 `functools.cmp_to_key` imported but unused
from itertools import chain, product
from itertools import chain, product, combinations
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import pyteomics.fasta
from psm_utils import PSMList
from psm_utils import PSMList, PSM
from pydantic import BaseModel, field_validator, model_validator
from pyteomics.parser import icleave
from rich.progress import track
from logging import getLogger
import numpy as np

logger = getLogger(__name__)

class ModificationConfig(BaseModel):
"""Configuration for a single modification in the search space."""

label: str
name: str
amino_acid: Optional[str] = None
peptide_n_term: Optional[bool] = False
protein_n_term: Optional[bool] = False
Expand Down Expand Up @@ -53,15 +56,44 @@ class PeptidoformSearchSpace(BaseModel):
modification_options: List[Dict[int, ModificationConfig]] = None
charge_options: List[int] = None

# TODO
def into_psm_list(self) -> PSMList:
"""Convert PeptidoformSpace to PSMList with given charge and modification."""
def into_psm_list(self, min_precursor_mz = 0, max_precursor_mz = np.Inf) -> PSMList:
"""Convert PeptidoformSearchSpace to PSMList with given charge and modification."""
if not self.charge_options:
raise ValueError("Peptide charge options not defined.")
if not self.modification_options:
raise ValueError("Peptide modification options not defined.")

raise NotImplementedError("Method not implemented yet.")
psms = []
# Maybe should be a global variable? Because now resets for every peptide
spectrum_id = 0
for modifications, charge in product(self.modification_options, self.charge_options):
offset = 0
if not modifications:
psm = PSM(peptidoform=(self.sequence+'/{}'.format(charge)), spectrum_id=spectrum_id)
spectrum_id += 1
else:
modded_sequence = list(self.sequence)
for position, mod in modifications.items():

if position != 0 and position != -1:
modded_sequence.insert(position+offset, f"[{mod}]")
offset += 1

elif position == 0:
modded_sequence.insert(0, f"[{mod}]-")
offset += 1
elif position == -1:
modded_sequence.append(f"-[{mod}]")

modded_sequence = "".join(modded_sequence)
psm = PSM(peptidoform=(modded_sequence+'/{}'.format(charge)), spectrum_id=spectrum_id)
spectrum_id += 1
if psm.peptidoform.theoretical_mz >= min_precursor_mz and psm.peptidoform.theoretical_mz <= max_precursor_mz:
psms.append(psm)
psm_list = PSMList(psm_list = psms)
return psm_list




DEFAULT_MODIFICATIONS = [
Expand All @@ -87,8 +119,8 @@ class ProteomeSearchSpace(BaseModel):
fasta_file: Path
min_length: int = 8
max_length: int = 30
min_precursor_mz: Optional[float] = None
max_precursor_mz: Optional[float] = None
min_precursor_mz: Optional[float] = 0
max_precursor_mz: Optional[float] = np.Inf
cleavage_rule: str = "trypsin"
missed_cleavages: int = 2
semi_specific: bool = False
Expand Down Expand Up @@ -141,8 +173,9 @@ def generate_psms(self, processes=1) -> PSMList:
if not self._peptidoform_space:
self.build_search_space(processes)

# TODO Implement filtering by precursor m/z on PSMs
return chain.from_iterable([psm.into_psm_list() for psm in self._peptidoform_space])
unfiltered_search_space = chain.from_iterable([psm.into_psm_list(self.min_precursor_mz, self.max_precursor_mz) for psm in self._peptidoform_space])

return unfiltered_search_space

def build_search_space(self, processes=1):
"""Build peptide search space from FASTA file."""
Expand All @@ -153,6 +186,8 @@ def build_search_space(self, processes=1):

def digest_fasta(self, processes=1):
"""Digest FASTA file to peptides and populate search space."""
# Convert to string to avoid issues with Path objects
self.fasta_file = str(self.fasta_file)
n_proteins = _count_fasta_entries(self.fasta_file)
if self.add_decoys:
fasta_db = pyteomics.fasta.decoy_db(
Expand Down Expand Up @@ -230,11 +265,6 @@ def add_charges(self):
):
peptide.charge_options = self.charges

# TODO
def filter_precursor_mz(self, processes=1):
"""Filter peptides based on precursor m/z."""
raise NotImplementedError()


def _digest_single_protein(
protein: pyteomics.fasta.Protein,
Expand Down Expand Up @@ -294,11 +324,11 @@ def _restructure_modifications_by_target(
) -> Dict[str, Dict[str, List[ModificationConfig]]]:
"""Restructure variable modifications to options per side chain or terminus."""
modifications_by_target = {
"sidechain": defaultdict(lambda: [None]),
"peptide_n_term": defaultdict(lambda: [None]),
"peptide_c_term": defaultdict(lambda: [None]),
"protein_n_term": defaultdict(lambda: [None]),
"protein_c_term": defaultdict(lambda: [None]),
"sidechain": defaultdict(lambda: []),
"peptide_n_term": defaultdict(lambda: []),
"peptide_c_term": defaultdict(lambda: []),
"protein_n_term": defaultdict(lambda: []),
"protein_c_term": defaultdict(lambda: []),
}

def add_mod(mod, target, amino_acid):
Expand Down Expand Up @@ -352,6 +382,34 @@ def _get_modification_versions(
[{}, {3: 'Phospho'}, {0: 'Acetyl'}, {0: 'Acetyl', 3: 'Phospho'}]
"""
def _get_combinations(possibilities_by_site, max_variable_modifications):
# Prepare dictionaries for fixed and variable modifications
fixed_modifications = {}
variable_sites = []

# Separate fixed and variable modification sites
for site, mods in possibilities_by_site.items():
for mod in mods:
if mod.fixed:
fixed_modifications[site] = mod.name
else:
variable_sites.append((site, mod.name))

# If no fixed modifications, add empty dictionary
if not fixed_modifications:
fixed_modifications = {}

# Generate all combinations of variable modifications up to the maximum allowed
valid_combinations = []
for i in range(max_variable_modifications + 1):
for comb in combinations(variable_sites, i):
combination_dict = fixed_modifications.copy()
for site, mod_name in comb:
combination_dict[site] = mod_name
valid_combinations.append(combination_dict)

return valid_combinations

possibilities_by_site = defaultdict(list)

# Generate dictionary of positions per amino acid
Expand Down Expand Up @@ -400,40 +458,12 @@ def _get_modification_versions(
for pos in pos_dict[aa]:
possibilities_by_site[pos] = [mod]

# Get all possible combinations of modifications for all sites
mod_permutations = product(*possibilities_by_site.values())
mod_positions = possibilities_by_site.keys()

# Filter by max modified sites (avoiding combinatorial explosion)
mod_permutations = filter(
lambda mods: sum([1 for m in mods if m is not None and not m.fixed])
<= max_variable_modifications,
mod_permutations,
)

def _compare_minus_one_larger(a, b):
"""Custom comparison function where `-1` is always larger."""
if a[0] == -1:
return 1
elif b[0] == -1:
return -1
else:
return a[0] - b[0]

# Get MS²PIP modifications strings for each combination
mod_strings = []
for p in mod_permutations:
if p == [""]:
mod_strings.append("-")
else:
mods = sorted(zip(mod_positions, p), key=cmp_to_key(_compare_minus_one_larger))
mod_strings.append("|".join(f"{p}|{m.name}" for p, m in mods if m))

return mod_strings

modification_versions = _get_combinations(possibilities_by_site, max_variable_modifications)
return modification_versions

def _get_pool(processes: int) -> Union[multiprocessing.Pool, multiprocessing.dummy.Pool]:
"""Get a multiprocessing pool with the given number of processes."""
# TODO: fix None default value for processes
if processes > 1:
return multiprocessing.Pool(processes)
else:
Expand Down

0 comments on commit 4cf2fde

Please sign in to comment.