working casanovo_db implementation (rough)

Noble-Lab · Mar 11, 2024 · 7813d7f · 7813d7f
1 parent 4e8aaee
commit 7813d7f
Show file tree

Hide file tree

Showing 3 changed files with 167 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,23 @@
 # Casanovo
 
+This branch of the Casanovo project contains code that implements the Casanovo-DB database search procedure. The preprint version of the paper can be found [here](https://www.biorxiv.org/content/10.1101/2024.01.26.577425v2). Our eventual goal is to provide the full database search functionality as part of Casanovo.  For now, however, this branch allows for testing of the methodology by making use of some important functionality available in the Crux mass spectrometry toolkit (http://crux.ms).
+You can install this branch (ideally, in an appropriately named Conda environment) using the following command:
+```
+  pip install git+https://github.com/Noble-Lab/casanovo.git@db_search
+```
+To use Casanovo-DB, you must also install the Crux toolkit.  Given a set of spectra in a file named, for example, `spectra.mgf` and a corresponding proteome fasta `proteome.fasta`, you can run a database search via the following commands:
+1. Build a peptide index in the directory `my_proteome`:
+- `crux tide-index proteome.fasta my_proteome`
+2. Identify candidate peptides for each spectrum (be sure to set `top-match` to a very high number):
+- `crux tide-search --output-dir search_results --top-match 1000000 spectra.mgf my_proteome`
+3. Extract the candidate peptides from the search results into a format readable by Casanovo-DB (`annotated.mgf`).
+- `casanovo --mode=annotate --peak_path spectra.mgf --tide_dir_path search_results --output annotated.mgf`
+4. Run Casanovo-DB:
+- `casanovo --mode=db --peak_path annotated.mgf --output casanovo_db_result.mztab`
+
+
+The resulting file is in mztab format, similar to that produced by Casanovo's `sequence` command, except that there are scores for every candidate peptide against their respective spectrum (pairs as specified in `annotated.mgf`).
+
 **_De Novo_ Mass Spectrometry Peptide Sequencing with a Transformer Model**
 
 ![image](https://user-images.githubusercontent.com/32707537/152622912-ca87da20-a64c-4e3f-9ca1-721c6b0d9c64.png)

diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -18,6 +18,8 @@
 import torch
 import tqdm
 import yaml
+import pandas as pd
+import pyteomics.mgf as mgf
 from pytorch_lightning.lite import LightningLite
 
 from . import __version__
@@ -39,8 +41,12 @@
     '- "train" will train a model (from scratch or by\ncontinuing training a '
     "previously trained model).\n"
     '- "eval" will evaluate the performance of a\ntrained model using '
-    "previously acquired spectrum\nannotations.",
-    type=click.Choice(["denovo", "train", "eval", "db"]),
+    "previously acquired spectrum\nannotations.\n"
+    '- "db" will use Casanovo-DB to score\nspectra against a database of\n'
+    "candidates specified in an .mgf\ncreated with annotate mode.\n"
+    '- "annotate" will use tide-search results\nto annotate an .mgf '
+    "file with candidate peptides.",
+    type=click.Choice(["denovo", "train", "eval", "db", "annotate"]),
 )
 @click.option(
     "--model",
@@ -51,7 +57,9 @@
     "--peak_path",
     required=True,
     help="The file path with peak files for predicting peptide sequences or "
-    "training Casanovo.",
+    "training Casanovo. If mode is 'db', this should be the path to the "
+    "annotated .mgf file. If mode is 'annotate', this should be the path to the "
+    ".mgf file you wish to annotate.",
 )
 @click.option(
     "--peak_path_val",
@@ -70,13 +78,20 @@
     "(optionally) prediction results (extension: .mztab).",
     type=click.Path(dir_okay=False),
 )
+@click.option(
+    "--tide_dir_path",
+    help="The directory containing the results of a successful tide search. "
+    "Used in annotate mode to annotate an .mgf file with candidate peptides.",
+    type=click.Path(exists=True, file_okay=False),
+)
 def main(
     mode: str,
     model: Optional[str],
     peak_path: str,
     peak_path_val: Optional[str],
     config: Optional[str],
     output: Optional[str],
+    tide_dir_path: Optional[str],
 ):
     """
     \b
@@ -155,6 +170,7 @@ def main(
     logger.debug("peak_path_val = %s", peak_path_val)
     logger.debug("config = %s", config.file)
     logger.debug("output = %s", output)
+    logger.debug("tide_dir_path = %s", tide_dir_path)
     for key, value in config.items():
         logger.debug("%s = %s", str(key), str(value))
 
@@ -175,6 +191,9 @@ def main(
         logger.info("Database seach with casanovo.")
         writer = ms_io.MztabWriter(f"{output}.mztab")
         model_runner.db_search(peak_path, model, config, writer)
+    elif mode == "annotate":
+        logger.info("Annotate .mgf file with candidate peptides.")
+        create_mgf_from_tide(tide_dir_path, peak_path, output)
 
 
 def _get_model_weights() -> str:
@@ -280,5 +299,116 @@ def _get_model_weights() -> str:
             )
 
 
+def _normalize_mods(seq: str) -> str:
+    """
+    Turns tide-style modifications into the format used by Casanovo-DB.
+
+        Parameters
+        ----------
+        seq : str
+            The peptide sequence with tide-style modifications.
+
+        Returns
+        -------
+        str
+            The peptide sequence with Casanovo-DB-style modifications.
+    """
+    seq = seq.replace("M[15.99]", "M+15.995")
+    seq = seq.replace("C", "C+57.021")
+    seq = seq.replace("N[0.98]", "N+0.984")
+    seq = seq.replace("Q[0.98]", "Q+0.984")
+    seq = re.sub(r"(.*)\[42\.01\]", r"+42.011\1", seq)
+    seq = re.sub(r"(.*)\[43\.01\]", r"+43.006\1", seq)
+    seq = re.sub(r"(.*)\[\-17\.03\]", r"-17.027\1", seq)
+    seq = re.sub(r"(.*)\[25\.98\]", r"+43.006-17.027\1", seq)
+    return seq
+
+
+def create_mgf_from_tide(
+    tide_dir_path: str, mgf_file: str, output_file: str
+) -> None:
+    """
+    Accepts a directory containing the results of a successful tide search, and an .mgf file containing MS/MS spectra.
+    The .mgf file is then annotated in the SEQ field with all of the candidate peptides for each spectrum, as well as their target/decoy status.
+    This annotated .mgf can be given directly to Casanovo-DB to perfrom a database search.
+
+        Parameters
+        ----------
+        tide_dir_path : str
+            Path to the directory containing the results of a successful tide search.
+        mgf_file : str
+            Path to the .mgf file containing MS/MS spectra.
+        output_file : str
+            Path to where the annotated .mgf will be written.
+
+    """
+    # Get paths to tide search text files
+    tdf_path = os.path.join(tide_dir_path, "tide-search.target.txt")
+    ddf_path = os.path.join(tide_dir_path, "tide-search.decoy.txt")
+    try:
+        target_df = pd.read_csv(
+            tdf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"]
+        )
+        decoy_df = pd.read_csv(
+            ddf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"]
+        )
+    except FileNotFoundError as e:
+        logger.error(
+            "Could not find tide search results in the specified directory. "
+            "Please ensure that the directory contains the following files: "
+            "tide-search.target.txt and tide-search.decoy.txt"
+        )
+        raise e
+
+    logger.info(
+        "Successfully read tide search results from %s.", tide_dir_path
+    )
+
+    df = pd.concat([target_df, decoy_df])
+    scan_groups = df.groupby("scan")[["sequence", "target/decoy"]]
+
+    scan_map = {}
+
+    for scan, item in scan_groups:
+        target_candidate_list = list(
+            map(
+                _normalize_mods,
+                item.groupby("target/decoy")["sequence"].apply(list)["target"],
+            )
+        )
+        decoy_candidate_list = list(
+            map(
+                _normalize_mods,
+                item.groupby("target/decoy")["sequence"].apply(list)["decoy"],
+            )
+        )
+        decoy_candidate_list = list(
+            map(lambda x: "decoy_" + str(x), decoy_candidate_list)
+        )
+        scan_map[scan] = target_candidate_list + decoy_candidate_list
+
+    all_spec = []
+    for idx, spec_dict in enumerate(
+        mgf.read(mgf_file)
+    ):  #! WILL NEED TO BE CHANGED FOR OTHER ENCODINGS OF SCAN
+        scan = int(
+            re.search(r"scan=(\d+)", spec_dict["params"]["title"]).group(1)
+        )
+        try:
+            spec_dict["params"]["seq"] = ",".join(list(scan_map[scan]))
+            all_spec.append(spec_dict)
+        except KeyError as e:
+            # No need to do anything if the scan is not found in the scan map
+            pass
+    try:
+        mgf.write(all_spec, output_file)
+        logger.info("Annotated .mgf file written to %s.", output_file)
+    except Exception as e:
+        print(
+            f"Write to {output_file} failed. Check if the file path is correct."
+        )
+        print(e)
+
+
 if __name__ == "__main__":
     main()
diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
@@ -945,7 +945,8 @@ class DBSpec2Pep(Spec2Pep):
     """
     Inherits Spec2Pep
 
-    Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide
+    Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide.
+    Input format is .mgf, with comma-separated targets and decoys in the SEQ field. Decoys should have a prefix of "decoy_".
     """
 
     num_pairs = 1024
@@ -1009,7 +1010,7 @@ def smart_batch_gen(self, batch):
                     )
                 )
             )
-        # continually grab n items from all_psm until list is exhausted
+        # continually grab num_pairs items from all_psm until list is exhausted
         while len(all_psm) > 0:
             batch = all_psm[: self.num_pairs]
             all_psm = all_psm[self.num_pairs :]
@@ -1029,7 +1030,17 @@ def on_predict_epoch_end(self, results) -> None:
             return
         results = np.array(results, dtype=object).squeeze((0))
         with open(self.out_writer.filename, "a") as out_f:
-            csv_writer = csv.writer(out_f)
+            csv_writer = csv.writer(out_f, delimiter="\t")
+            # Write a header
+            csv_writer.writerow(
+                (
+                    "index",
+                    "peptide",
+                    "target",
+                    "score",
+                    "per_aa_scores",
+                )
+            )
             for group in results:
                 for batch in group:
                     for index, t_or_d, peptide, score, per_aa_scores in list(
@@ -1056,7 +1067,8 @@ def calc_match_score(
 ) -> List[float]:
     """
     Take in teacher-forced scoring of amino acids of the peptides (in a batch) and use the truth labels
-    to calculate a score between the input spectra and associated peptide.
+    to calculate a score between the input spectra and associated peptide. The score is the geometric
+    mean of the AA probabilities
 
         Parameters
         ----------