From 723d146649fd2c482c9f5d9f4be8b2535f483598 Mon Sep 17 00:00:00 2001
From: VarunAnanth2003 <varunananth1@gmail.com>
Date: Mon, 11 Mar 2024 11:44:20 -0700
Subject: [PATCH] Fix scan reading

---
 README.md                       |  6 ++++++
 casanovo/casanovo.py            | 14 ++++++++------
 casanovo/denovo/model.py        | 21 ++++++++++++---------
 casanovo/denovo/model_runner.py |  1 +
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 5c2e847d..582145eb 100644
--- a/README.md
+++ b/README.md
@@ -8,10 +8,16 @@ You can install this branch (ideally, in an appropriately named Conda environmen
 To use Casanovo-DB, you must also install the Crux toolkit.  Given a set of spectra in a file named, for example, `spectra.mgf` and a corresponding proteome fasta `proteome.fasta`, you can run a database search via the following commands:
 1. Build a peptide index in the directory `my_proteome`:
 - `crux tide-index proteome.fasta my_proteome`
+
+Please note that your `.fasta` file cannot contain the amino acid 'U', because it is not in Casanovo's vocabulary. Replace every occurrence of this character with 'X' to denote a missing amino acid.
+
 2. Identify candidate peptides for each spectrum (be sure to set `top-match` to a very high number):
 - `crux tide-search --output-dir search_results --top-match 1000000 spectra.mgf my_proteome`
 3. Extract the candidate peptides from the search results into a format readable by Casanovo-DB (`annotated.mgf`).
 - `casanovo --mode=annotate --peak_path spectra.mgf --tide_dir_path search_results --output annotated.mgf`
+
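+Please note that every spectrum in `spectra.mgf` must contain the `SCANS=` field, because the scan number is used to match each spectrum to its candidate peptides.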
+
 4. Run Casanovo-DB:
 - `casanovo --mode=db --peak_path annotated.mgf --output casanovo_db_result.mztab`
 
diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
index 244150bf..edc7e494 100644
--- a/casanovo/casanovo.py
+++ b/casanovo/casanovo.py
@@ -388,12 +388,14 @@ def create_mgf_from_tide(
         scan_map[scan] = target_candidate_list + decoy_candidate_list
 
     all_spec = []
-    for idx, spec_dict in enumerate(
-        mgf.read(mgf_file)
-    ):  #! WILL NEED TO BE CHANGED FOR OTHER ENCODINGS OF SCAN
-        scan = int(
-            re.search(r"scan=(\d+)", spec_dict["params"]["title"]).group(1)
-        )
+    for idx, spec_dict in enumerate(mgf.read(mgf_file)):
+        try:
+            scan = int(spec_dict["params"]["scans"])
+        except KeyError as e:
+            logger.error(
+                "Could not find the scan number in the .mgf file. Please ensure that the .mgf file contains the scan number in the 'SCANS' field."
+            )
+            raise e
         try:
             spec_dict["params"]["seq"] = ",".join(list(scan_map[scan]))
             all_spec.append(spec_dict)
diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
index 038dd371..6e7d47a9 100644
--- a/casanovo/denovo/model.py
+++ b/casanovo/denovo/model.py
@@ -6,6 +6,7 @@
 import csv
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import operator
+import os
 
 import depthcharge.masses
 import einops
@@ -1031,16 +1032,18 @@ def on_predict_epoch_end(self, results) -> None:
         results = np.array(results, dtype=object).squeeze((0))
         with open(self.out_writer.filename, "a") as out_f:
             csv_writer = csv.writer(out_f, delimiter="\t")
-            # Write a header
-            csv_writer.writerow(
-                (
-                    "index",
-                    "peptide",
-                    "target",
-                    "score",
-                    "per_aa_scores",
+            # Write a header IF THE FILE IS BLANK
+            if os.stat(self.out_writer.filename).st_size == 0:
+                csv_writer.writerow(
+                    (
+                        "index",
+                        "peptide",
+                        "target",
+                        "score",
+                        "per_aa_scores",
+                    )
                 )
-            )
+            # Write rows
             for group in results:
                 for batch in group:
                     for index, t_or_d, peptide, score, per_aa_scores in list(
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index 96408d9e..72cd78b7 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -355,6 +355,7 @@ def db_search(
         n_beams=config["n_beams"],
         n_log=config["n_log"],
         out_writer=out_writer,
+        top_match=config["top_match"],
     )
     # Read the MS/MS spectra for which to predict peptide sequences.
     peak_ext = (".mgf", ".h5", ".hdf5")