Skip to content

Commit

Permalink
working casanovo_db implementation (rough)
Browse files Browse the repository at this point in the history
  • Loading branch information
VarunAnanth2003 committed Mar 11, 2024
1 parent 4e8aaee commit 7813d7f
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 7 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,23 @@
# Casanovo

This branch of the Casanovo project contains code that implements the Casanovo-DB database search procedure. The preprint version of the paper can be found [here](https://www.biorxiv.org/content/10.1101/2024.01.26.577425v2). Our eventual goal is to provide the full database search functionality as part of Casanovo. For now, however, this branch allows for testing of the methodology by making use of some important functionality available in the Crux mass spectrometry toolkit (http://crux.ms).
You can install this branch (ideally, in an appropriately named Conda environment) using the following command:
```
pip install git+https://github.com/Noble-Lab/casanovo.git@db_search
```
To use Casanovo-DB, you must also install the Crux toolkit. Given a set of spectra in a file named, for example, `spectra.mgf` and a corresponding proteome fasta `proteome.fasta`, you can run a database search via the following commands:
1. Build a peptide index in the directory `my_proteome`:
- `crux tide-index proteome.fasta my_proteome`
2. Identify candidate peptides for each spectrum (be sure to set `top-match` to a very high number):
- `crux tide-search --output-dir search_results --top-match 1000000 spectra.mgf my_proteome`
3. Extract the candidate peptides from the search results into a format readable by Casanovo-DB (`annotated.mgf`).
- `casanovo --mode=annotate --peak_path spectra.mgf --tide_dir_path search_results --output annotated.mgf`
4. Run Casanovo-DB:
- `casanovo --mode=db --peak_path annotated.mgf --output casanovo_db_result.mztab`


The resulting file is in mztab format, similar to that produced by Casanovo's `sequence` command, except that there are scores for every candidate peptide against their respective spectrum (pairs as specified in `annotated.mgf`).

**_De Novo_ Mass Spectrometry Peptide Sequencing with a Transformer Model**

![image](https://user-images.githubusercontent.com/32707537/152622912-ca87da20-a64c-4e3f-9ca1-721c6b0d9c64.png)
Expand Down
136 changes: 133 additions & 3 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import torch
import tqdm
import yaml
import pandas as pd
import pyteomics.mgf as mgf
from pytorch_lightning.lite import LightningLite

from . import __version__
Expand All @@ -39,8 +41,12 @@
'- "train" will train a model (from scratch or by\ncontinuing training a '
"previously trained model).\n"
'- "eval" will evaluate the performance of a\ntrained model using '
"previously acquired spectrum\nannotations.",
type=click.Choice(["denovo", "train", "eval", "db"]),
"previously acquired spectrum\nannotations.\n"
'- "db" will use Casanovo-DB to score\nspectra against a database of\n'
"candidates specified in an .mgf\ncreated with annotate mode.\n"
'- "annotate" will use tide-search results\nto annotate an .mgf '
"file with candidate peptides.",
type=click.Choice(["denovo", "train", "eval", "db", "annotate"]),
)
@click.option(
"--model",
Expand All @@ -51,7 +57,9 @@
"--peak_path",
required=True,
help="The file path with peak files for predicting peptide sequences or "
"training Casanovo.",
"training Casanovo. If mode is 'db', this should be the path to the "
"annotated .mgf file. If mode is 'annotate', this should be the path to the "
".mgf file you wish to annotate.",
)
@click.option(
"--peak_path_val",
Expand All @@ -70,13 +78,20 @@
"(optionally) prediction results (extension: .mztab).",
type=click.Path(dir_okay=False),
)
@click.option(
"--tide_dir_path",
help="The directory containing the results of a successful tide search. "
"Used in annotate mode to annotate an .mgf file with candidate peptides.",
type=click.Path(exists=True, file_okay=False),
)
def main(
mode: str,
model: Optional[str],
peak_path: str,
peak_path_val: Optional[str],
config: Optional[str],
output: Optional[str],
tide_dir_path: Optional[str],
):
"""
\b
Expand Down Expand Up @@ -155,6 +170,7 @@ def main(
logger.debug("peak_path_val = %s", peak_path_val)
logger.debug("config = %s", config.file)
logger.debug("output = %s", output)
logger.debug("tide_dir_path = %s", tide_dir_path)
for key, value in config.items():
logger.debug("%s = %s", str(key), str(value))

Expand All @@ -175,6 +191,9 @@ def main(
logger.info("Database seach with casanovo.")
writer = ms_io.MztabWriter(f"{output}.mztab")
model_runner.db_search(peak_path, model, config, writer)
elif mode == "annotate":
logger.info("Annotate .mgf file with candidate peptides.")
create_mgf_from_tide(tide_dir_path, peak_path, output)


def _get_model_weights() -> str:
Expand Down Expand Up @@ -280,5 +299,116 @@ def _get_model_weights() -> str:
)


def _normalize_mods(seq: str) -> str:
"""
Turns tide-style modifications into the format used by Casanovo-DB.
Parameters
----------
seq : str
The peptide sequence with tide-style modifications.
Returns
-------
str
The peptide sequence with Casanovo-DB-style modifications.
"""
seq = seq.replace("M[15.99]", "M+15.995")
seq = seq.replace("C", "C+57.021")
seq = seq.replace("N[0.98]", "N+0.984")
seq = seq.replace("Q[0.98]", "Q+0.984")
seq = re.sub(r"(.*)\[42\.01\]", r"+42.011\1", seq)
seq = re.sub(r"(.*)\[43\.01\]", r"+43.006\1", seq)
seq = re.sub(r"(.*)\[\-17\.03\]", r"-17.027\1", seq)
seq = re.sub(r"(.*)\[25\.98\]", r"+43.006-17.027\1", seq)
return seq


def create_mgf_from_tide(
tide_dir_path: str, mgf_file: str, output_file: str
) -> None:
"""
Accepts a directory containing the results of a successful tide search, and an .mgf file containing MS/MS spectra.
The .mgf file is then annotated in the SEQ field with all of the candidate peptides for each spectrum, as well as their target/decoy status.
This annotated .mgf can be given directly to Casanovo-DB to perfrom a database search.
Parameters
----------
tide_dir_path : str
Path to the directory containing the results of a successful tide search.
mgf_file : str
Path to the .mgf file containing MS/MS spectra.
output_file : str
Path to where the annotated .mgf will be written.
"""
# Get paths to tide search text files
tdf_path = os.path.join(tide_dir_path, "tide-search.target.txt")
ddf_path = os.path.join(tide_dir_path, "tide-search.decoy.txt")
try:
target_df = pd.read_csv(
tdf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"]
)
decoy_df = pd.read_csv(
ddf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"]
)
except FileNotFoundError as e:
logger.error(
"Could not find tide search results in the specified directory. "
"Please ensure that the directory contains the following files: "
"tide-search.target.txt and tide-search.decoy.txt"
)
raise e

logger.info(
"Successfully read tide search results from %s.", tide_dir_path
)

df = pd.concat([target_df, decoy_df])
scan_groups = df.groupby("scan")[["sequence", "target/decoy"]]

scan_map = {}

for scan, item in scan_groups:
target_candidate_list = list(
map(
_normalize_mods,
item.groupby("target/decoy")["sequence"].apply(list)["target"],
)
)
decoy_candidate_list = list(
map(
_normalize_mods,
item.groupby("target/decoy")["sequence"].apply(list)["decoy"],
)
)
decoy_candidate_list = list(
map(lambda x: "decoy_" + str(x), decoy_candidate_list)
)
scan_map[scan] = target_candidate_list + decoy_candidate_list

all_spec = []
for idx, spec_dict in enumerate(
mgf.read(mgf_file)
): #! WILL NEED TO BE CHANGED FOR OTHER ENCODINGS OF SCAN
scan = int(
re.search(r"scan=(\d+)", spec_dict["params"]["title"]).group(1)
)
try:
spec_dict["params"]["seq"] = ",".join(list(scan_map[scan]))
all_spec.append(spec_dict)
except KeyError as e:
# No need to do anything if the scan is not found in the scan map
pass
try:
mgf.write(all_spec, output_file)
logger.info("Annotated .mgf file written to %s.", output_file)
except Exception as e:
print(
f"Write to {output_file} failed. Check if the file path is correct."
)
print(e)


if __name__ == "__main__":
main()
20 changes: 16 additions & 4 deletions casanovo/denovo/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,7 +945,8 @@ class DBSpec2Pep(Spec2Pep):
"""
Inherits Spec2Pep
Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide
Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide.
Input format is .mgf, with comma-separated targets and decoys in the SEQ field. Decoys should have a prefix of "decoy_".
"""

num_pairs = 1024
Expand Down Expand Up @@ -1009,7 +1010,7 @@ def smart_batch_gen(self, batch):
)
)
)
# continually grab n items from all_psm until list is exhausted
# continually grab num_pairs items from all_psm until list is exhausted
while len(all_psm) > 0:
batch = all_psm[: self.num_pairs]
all_psm = all_psm[self.num_pairs :]
Expand All @@ -1029,7 +1030,17 @@ def on_predict_epoch_end(self, results) -> None:
return
results = np.array(results, dtype=object).squeeze((0))
with open(self.out_writer.filename, "a") as out_f:
csv_writer = csv.writer(out_f)
csv_writer = csv.writer(out_f, delimiter="\t")
# Write a header
csv_writer.writerow(
(
"index",
"peptide",
"target",
"score",
"per_aa_scores",
)
)
for group in results:
for batch in group:
for index, t_or_d, peptide, score, per_aa_scores in list(
Expand All @@ -1056,7 +1067,8 @@ def calc_match_score(
) -> List[float]:
"""
Take in teacher-forced scoring of amino acids of the peptides (in a batch) and use the truth labels
to calculate a score between the input spectra and associated peptide.
to calculate a score between the input spectra and associated peptide. The score is the geometric
mean of the AA probabilities
Parameters
----------
Expand Down

0 comments on commit 7813d7f

Please sign in to comment.