From 723d146649fd2c482c9f5d9f4be8b2535f483598 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 11 Mar 2024 11:44:20 -0700 Subject: [PATCH] Fix scan reading --- README.md | 6 ++++++ casanovo/casanovo.py | 14 ++++++++------ casanovo/denovo/model.py | 21 ++++++++++++--------- casanovo/denovo/model_runner.py | 1 + 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 5c2e847d..582145eb 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,16 @@ You can install this branch (ideally, in an appropriately named Conda environmen To use Casanovo-DB, you must also install the Crux toolkit. Given a set of spectra in a file named, for example, `spectra.mgf` and a corresponding proteome fasta `proteome.fasta`, you can run a database search via the following commands: 1. Build a peptide index in the directory `my_proteome`: - `crux tide-index proteome.fasta my_proteome` + +Please note that your `.fasta` file must not contain any 'U' amino acids, because 'U' is not in Casanovo's vocabulary. Replace all occurrences of this character with 'X' to denote a missing amino acid. + 2. Identify candidate peptides for each spectrum (be sure to set `top-match` to a very high number): - `crux tide-search --output-dir search_results --top-match 1000000 spectra.mgf my_proteome` 3. Extract the candidate peptides from the search results into a format readable by Casanovo-DB (`annotated.mgf`). - `casanovo --mode=annotate --peak_path spectra.mgf --tide_dir_path search_results --output annotated.mgf` + +Please note that every spectrum in `spectra.mgf` must contain the `SCANS=` field; the scan number is used to look up that spectrum's candidate peptides in the search results. + 4. 
Run Casanovo-DB: - `casanovo --mode=db --peak_path annotated.mgf --output casanovo_db_result.mztab` diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 244150bf..edc7e494 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -388,12 +388,14 @@ def create_mgf_from_tide( scan_map[scan] = target_candidate_list + decoy_candidate_list all_spec = [] - for idx, spec_dict in enumerate( - mgf.read(mgf_file) - ): #! WILL NEED TO BE CHANGED FOR OTHER ENCODINGS OF SCAN - scan = int( - re.search(r"scan=(\d+)", spec_dict["params"]["title"]).group(1) - ) + for idx, spec_dict in enumerate(mgf.read(mgf_file)): + try: + scan = int(spec_dict["params"]["scans"]) + except KeyError as e: + logger.error( + "Could not find the scan number in the .mgf file. Please ensure that the .mgf file contains the scan number in the 'SCANS' field." + ) + raise e try: spec_dict["params"]["seq"] = ",".join(list(scan_map[scan])) all_spec.append(spec_dict) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 038dd371..6e7d47a9 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -6,6 +6,7 @@ import csv from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import operator +import os import depthcharge.masses import einops @@ -1031,16 +1032,18 @@ def on_predict_epoch_end(self, results) -> None: results = np.array(results, dtype=object).squeeze((0)) with open(self.out_writer.filename, "a") as out_f: csv_writer = csv.writer(out_f, delimiter="\t") - # Write a header - csv_writer.writerow( - ( - "index", - "peptide", - "target", - "score", - "per_aa_scores", + # Write a header IF THE FILE IS BLANK + if os.stat(self.out_writer.filename).st_size == 0: + csv_writer.writerow( + ( + "index", + "peptide", + "target", + "score", + "per_aa_scores", + ) ) - ) + # Write rows for group in results: for batch in group: for index, t_or_d, peptide, score, per_aa_scores in list( diff --git a/casanovo/denovo/model_runner.py 
b/casanovo/denovo/model_runner.py index 96408d9e..72cd78b7 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -355,6 +355,7 @@ def db_search( n_beams=config["n_beams"], n_log=config["n_log"], out_writer=out_writer, + top_match=config["top_match"], ) # Read the MS/MS spectra for which to predict peptide sequences. peak_ext = (".mgf", ".h5", ".hdf5")