fix detection of start/stop codons for 'complete' input

* fix/complete-start: run prodigal without meta option; allow empty sequences; include start and stop codons in output; add last night's benchmarks to the evaluation README; add quality measuring scripts; compare to negative strand for dna_start_t_withstop; invert s_save to avoid negative numbers; negate strcmp; dropme: start with start states; infinity is sometimes smaller in FGS; unfix start1/stop1 filenames; include reverse stop codon in meta output; fix ACGT typo to correct complete predictions; revert "extend genes in complete genomes to start/stop codons"; use corrected dna_start_t; extend genes in complete genomes to start/stop codons
unipept · Oct 6, 2021 · 2757c4e · 2757c4e
2 parents 7bddb73 + 437ef43
commit 2757c4e
Show file tree

Hide file tree

Showing 10 changed files with 228 additions and 29 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,8 @@
 /target
+FragGeneScan
+FGS+
+prodigal
+*.ffn
+*.faa
+*.gff
+*.csv
diff --git a/meta/evaluation/.gitignore b/meta/evaluation/.gitignore
@@ -0,0 +1,2 @@
+*.fasta
+*.txt
diff --git a/meta/evaluation/README.md b/meta/evaluation/README.md
@@ -0,0 +1,67 @@
+# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes
+
+Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes
+
+The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly.
+
+The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes.
+
+## Create the annotations and lengths files
+
+Execute `annotations.py`.
+
+## Create the FGS/FGS+ files (from .faa)
+
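+(each tool must be run from its own directory, so `cd` into the FGS or FGS+ checkout as appropriate; `~-` expands to the previous working directory, i.e. this evaluation directory)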
+
+```sh
+cd path/to/FGS
+./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1
+./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1
+cd -
+rm FGS.out FGS.ffn
+sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv
+sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv
+```
+
+## Create the FGSrs/Prodigal files (from .gff)
+
+```sh
+FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1
+prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff
+grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv
+grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv
+```
+
+## Print comparison table
+
+Execute `rates.py`.
+
+## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine)
+
+Run in the FGS or FGS+ directory (for the training files).
+
+```sh
+hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \
+          'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \
+          'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \
+          'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff'
+```
+
+```
+Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1
+  Time (mean ± σ):      3.797 s ±  0.006 s    [User: 3.413 s, System: 0.348 s]
+  Range (min … max):    3.792 s …  3.807 s    5 runs
+
+Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1
+  Time (mean ± σ):     369.979 s ± 25.774 s    [User: 367.679 s, System: 0.517 s]
+  Range (min … max):   353.713 s … 415.649 s    5 runs
+
+Benchmark #3: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1
+  Time (mean ± σ):      1.703 s ±  0.014 s    [User: 1.395 s, System: 0.275 s]
+  Range (min … max):    1.684 s …  1.719 s    5 runs
+
+Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff
+  Time (mean ± σ):      8.533 s ±  0.038 s    [User: 8.453 s, System: 0.047 s]
+  Range (min … max):    8.493 s …  8.573 s    5 runs
+```
diff --git a/meta/evaluation/annotations.py b/meta/evaluation/annotations.py
@@ -0,0 +1,18 @@
+with open('ena_data_20210917-1328.txt') as f, open('annotations.csv', 'w') as a, open('readlengths.csv', 'w') as l:
+    for line in f:
+        if line.startswith('AC'):
+            accession = line[2:].strip()[:-1]
+        elif line.startswith('FT   gene'):
+            span = line[9:].strip()
+            if span.startswith('complement('):
+                span = span[11:-1]
+                strand = '-'
+            else:
+                strand = '+'
+            start, end = span.split('..')
+            print(accession, start, end, strand, sep=',', file=a)
+        elif line.startswith('SQ'):
+            parts = line[2:].strip().split(' ')
+            assert parts[0] == "Sequence"
+            print(accession, parts[1], sep=',', file=l)
+            assert parts[2].startswith("BP")
diff --git a/meta/evaluation/rates.py b/meta/evaluation/rates.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+from collections import namedtuple
+from operator import itemgetter
+
+Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand'])
+Event = namedtuple('Event', ['location', 'annotation', 'prediction'])
+
+def parse_readlengths(filename):
+	with open(filename) as f:
+		for line in f:
+			name, length = line.strip().split(',')
+			yield name, int(length)
+
+def parse_csv(filename):
+	with open(filename) as f:
+		for line in f:
+			name, start, end, strand = line.strip().split(',')
+			yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1)
+
+def events(read_lengths, annotations, predictions):
+	names = dict()
+	total = 0
+	for name, length in parse_readlengths(read_lengths):
+		names[name] = total
+		total += length
+
+	for name, start, end, strand in parse_csv(annotations):
+		yield Event(names[name] + start, strand, None)
+		yield Event(names[name] + end, 0, None)
+
+	for name, start, end, strand in parse_csv(predictions):
+		yield Event(names[name] + start, None, strand)
+		yield Event(names[name] + end, None, 0)
+
+def rates(read_lengths, annotations, predictions):
+	rates = dict(tp=0, fp=0, tn=0, fn=0)
+	cl = 0 # current location
+	ca = 0 # current annotation
+	cp = 0 # current prediction
+	for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)):
+		if ca == 0 and cp == 0:
+			rates['tn'] += l - cl
+		elif ca == cp:
+			rates['tp'] += l - cl
+		elif ca == 0 and cp != 0:
+			rates['fp'] += l - cl
+		elif ca != 0 and cp == 0:
+			rates['fn'] += l - cl
+		else: # different strands
+			rates['fp'] += l - cl
+
+		if a is not None: ca = a
+		if p is not None: cp = p
+		cl = l
+	return rates
+
+body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}'
+head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}'
+print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC'))
+for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']:
+	r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv')
+	tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn']
+	t = tp + fp + tn + fn
+	print(body.format(
+		tool,
+		tp / t,
+		fp / t,
+		tn / t,
+		fn / t,
+		tp / (tp + fp),
+		tp / (tp + fn),
+		tn / (tn + fp),
+		tn / (tn + fn),
+		(tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5
+	))
diff --git a/meta/out-to-gff.awk b/meta/out-to-gff.awk
@@ -0,0 +1,11 @@
+BEGIN { print "##gff-version 3"; }
+{
+    s = substr($1, 1, 1)
+    if (s == ">") {
+        seqid = substr($1, 2)
+    } else {
+        s = split($0, t, "\t")
+        id = "ID=" seqid "_" t[1] "_" t[2] "_" t[3] ";product=predicted protein"
+        print seqid "\tFGS\tCDS\t" t[1] "\t" t[2] "\t.\t" t[3] "\t" int(t[4] - 1) "\t" id
+    }
+}
diff --git a/src/bin/FragGeneScanRs.rs b/src/bin/FragGeneScanRs.rs
@@ -23,6 +23,7 @@ use rayon::ThreadPoolBuilder;
 
 extern crate frag_gene_scan_rs;
 use frag_gene_scan_rs::dna::{count_cg_content, Nuc};
+use frag_gene_scan_rs::gene;
 use frag_gene_scan_rs::hmm;
 use frag_gene_scan_rs::viterbi::viterbi;
 
@@ -236,13 +237,17 @@ fn run<R: Read + Send, W: WritingBuffer + Send>(
                 let fasta::OwnedRecord { mut head, seq } = record?;
                 head = head.into_iter().take_while(u8::is_ascii_graphic).collect();
                 let nseq: Vec<Nuc> = seq.into_iter().map(Nuc::from).collect();
-                let read_prediction = viterbi(
-                    &global,
-                    &locals[count_cg_content(&nseq)],
-                    head,
-                    nseq,
-                    whole_genome,
-                );
+                let read_prediction = if nseq.is_empty() {
+                    gene::ReadPrediction::new(head)
+                } else {
+                    viterbi(
+                        &global,
+                        &locals[count_cg_content(&nseq)],
+                        head,
+                        nseq,
+                        whole_genome,
+                    )
+                };
                 if meta_buffer.is_some() {
                     read_prediction.meta(&mut metabuf)?;
                 }

diff --git a/src/gene.rs b/src/gene.rs
@@ -74,7 +74,6 @@ impl ReadPrediction {
 
 pub struct Gene {
     pub start: usize,
-    pub metastart: usize,
     pub end: usize,
     pub frame: usize,
     pub score: f64,
@@ -89,7 +88,7 @@ impl Gene {
         buf.append(
             &mut format!(
                 "{}\t{}\t{}\t{}\t{:.6}\tI:{}\tD:{}\n",
-                self.metastart,
+                self.start,
                 self.end,
                 if self.forward_strand { '+' } else { '-' },
                 self.frame,
@@ -112,12 +111,12 @@ impl Gene {
             &mut format!(
                 "{}\tFGS\tCDS\t{}\t{}\t.\t{}\t{}\tID={}_{}_{}_{};product=predicted protein\n",
                 head,
-                self.metastart,
+                self.start,
                 self.end,
                 if self.forward_strand { '+' } else { '-' },
                 self.frame - 1,
                 head,
-                self.metastart,
+                self.start,
                 self.end,
                 if self.forward_strand { '+' } else { '-' }
             )

diff --git a/src/hmm.rs b/src/hmm.rs
@@ -149,8 +149,8 @@ pub fn get_train_from_file(
     read_noncoding(&mut locals, train_dir.join("noncoding"))?;
     read_start(&mut locals, train_dir.join("start"))?;
     read_stop(&mut locals, train_dir.join("stop"))?;
-    read_start1(&mut locals, train_dir.join("start1"))?;
-    read_stop1(&mut locals, train_dir.join("stop1"))?;
+    read_start1(&mut locals, train_dir.join("stop1"))?; // keep FGS naming scheme
+    read_stop1(&mut locals, train_dir.join("start1"))?; // keep FGS naming scheme
     read_pwm(&mut locals, train_dir.join("pwm"))?;
 
     Ok((global, locals))