diff --git a/.gitignore b/.gitignore index ea8c4bf..26b21af 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,8 @@ /target +FragGeneScan +FGS+ +prodigal +*.ffn +*.faa +*.gff +*.csv diff --git a/meta/evaluation/.gitignore b/meta/evaluation/.gitignore new file mode 100644 index 0000000..89e0fe0 --- /dev/null +++ b/meta/evaluation/.gitignore @@ -0,0 +1,2 @@ +*.fasta +*.txt diff --git a/meta/evaluation/README.md b/meta/evaluation/README.md new file mode 100644 index 0000000..6ec1e10 --- /dev/null +++ b/meta/evaluation/README.md @@ -0,0 +1,67 @@ +# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes + +Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes + +The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly. + +The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes. + +## Create the annotations and lengths files + +Execute `annotations.py`. + +## Create the FGS/FGS+ files (from .aa) + +(swap directories to execute these) + +```sh +cd path/to/FGS +./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1 +./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1 +cd - +rm FGS.out FGS.ffn +sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv +sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv +``` + +## Create the FGSrs/Prodigal files (from .gff) + +```sh +FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1 +prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff +grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv +grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv +``` + +## Print comparison table + +Execute `rates.py`. + +## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine) + +Run in the FGS or FGS+ directory (for the training files). + +```sh +hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \ + 'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \ + 'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \ + 'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff' +``` + +``` +Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1 + Time (mean ± σ): 3.797 s ± 0.006 s [User: 3.413 s, System: 0.348 s] + Range (min … max): 3.792 s … 3.807 s 5 runs + +Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1 + Time (mean ± σ): 369.979 s ± 25.774 s [User: 367.679 s, System: 0.517 s] + Range (min … max): 353.713 s … 415.649 s 5 runs + +Benchmark #1: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1 + Time (mean ± σ): 1.703 s ± 0.014 s [User: 1.395 s, System: 0.275 s] + Range (min … max): 1.684 s … 1.719 s 5 runs + +Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff + Time (mean ± σ): 8.533 s ± 0.038 s [User: 8.453 s, System: 0.047 s] + Range (min … max): 8.493 s … 8.573 s 5 runs +``` diff --git a/meta/evaluation/annotations.py b/meta/evaluation/annotations.py new file mode 100644 index 0000000..c76c8d1 --- /dev/null +++ b/meta/evaluation/annotations.py @@ -0,0 +1,18 @@ +with open('ena_data_20210917-1328.txt') as f, open('annotations.csv', 'w') as a, open('readlengths.csv', 'w') as l: + for line in f: + if line.startswith('AC'): + accession = line[2:].strip()[:-1] + elif line.startswith('FT gene'): + span = line[9:].strip() + if span.startswith('complement('): + span = span[11:-1] + strand = '-' + else: + strand = '+' + start, end = span.split('..') + print(accession, start, end, strand, sep=',', file=a) + elif line.startswith('SQ'): + parts = line[2:].strip().split(' ') + assert parts[0] == "Sequence" + print(accession, parts[1], sep=',', file=l) + assert parts[2].startswith("BP") diff --git a/meta/evaluation/rates.py b/meta/evaluation/rates.py new file mode 100755 index 0000000..a0ddfb8 --- /dev/null +++ b/meta/evaluation/rates.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +from collections import namedtuple +from operator import itemgetter + +Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand']) +Event = namedtuple('Event', ['location', 'annotation', 'prediction']) + +def parse_readlengths(filename): + with open(filename) as f: + for line in f: + name, length = line.strip().split(',') + yield name, int(length) + +def parse_csv(filename): + with open(filename) as f: + for line in f: + name, start, end, strand = line.strip().split(',') + yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1) + +def events(read_lengths, annotations, predictions): + names = dict() + total = 0 + for name, length in parse_readlengths(read_lengths): + names[name] = total + total += length + + for name, start, end, strand in parse_csv(annotations): + yield Event(names[name] + start, strand, None) + yield Event(names[name] + end, 0, None) + + for name, start, end, strand in parse_csv(predictions): + yield Event(names[name] + start, None, strand) + yield Event(names[name] + end, None, 0) + +def rates(read_lengths, annotations, predictions): + rates = dict(tp=0, fp=0, tn=0, fn=0) + cl = 0 # current location + ca = 0 # current annotation + cp = 0 # current prediction + for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)): + if ca == 0 and cp == 0: + rates['tn'] += l - cl + elif ca == cp: + rates['tp'] += l - cl + elif ca == 0 and cp != 0: + rates['fp'] += l - cl + elif ca != 0 and cp == 0: + rates['fn'] += l - cl + else: # different strands + rates['fp'] += l - cl + + if a is not None: ca = a + if p is not None: cp = p + cl = l + return rates + +body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}' +head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}' +print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC')) +for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']: + r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv') + tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn'] + t = tp + fp + tn + fn + print(body.format( + tool, + tp / t, + fp / t, + tn / t, + fn / t, + tp / (tp + fp), + tp / (tp + fn), + tn / (tn + fp), + tn / (tn + fn), + (tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5 + )) diff --git a/meta/out-to-gff.awk b/meta/out-to-gff.awk new file mode 100644 index 0000000..41b02ef --- /dev/null +++ b/meta/out-to-gff.awk @@ -0,0 +1,11 @@ +BEGIN { print "##gff-version 3"; } +{ + s = substr($1, 1, 1) + if (s == ">") { + seqid = substr($1, 2) + } else { + s = split($0, t, "\t") + id = "ID=" seqid "_" t[1] "_" t[2] "_" t[3] ";product=predicted protein" + print seqid "\tFGS\tCDS\t" t[1] "\t" t[2] "\t.\t" t[3] "\t" int(t[4] - 1) "\t" id + } +} \ No newline at end of file diff --git a/src/bin/FragGeneScanRs.rs b/src/bin/FragGeneScanRs.rs index 14be4f7..6d37d2c 100644 --- a/src/bin/FragGeneScanRs.rs +++ b/src/bin/FragGeneScanRs.rs @@ -23,6 +23,7 @@ use rayon::ThreadPoolBuilder; extern crate frag_gene_scan_rs; use frag_gene_scan_rs::dna::{count_cg_content, Nuc}; +use frag_gene_scan_rs::gene; use frag_gene_scan_rs::hmm; use frag_gene_scan_rs::viterbi::viterbi; @@ -236,13 +237,17 @@ fn run( let fasta::OwnedRecord { mut head, seq } = record?; head = head.into_iter().take_while(u8::is_ascii_graphic).collect(); let nseq: Vec = seq.into_iter().map(Nuc::from).collect(); - let read_prediction = viterbi( - &global, - &locals[count_cg_content(&nseq)], - head, - nseq, - whole_genome, - ); + let read_prediction = if nseq.is_empty() { + gene::ReadPrediction::new(head) + } else { + viterbi( + &global, + &locals[count_cg_content(&nseq)], + head, + nseq, + whole_genome, + ) + }; if meta_buffer.is_some() { read_prediction.meta(&mut metabuf)?; } diff --git a/src/gene.rs b/src/gene.rs index f51710d..ce0dbef 100644 --- a/src/gene.rs +++ b/src/gene.rs @@ -74,7 +74,6 @@ impl ReadPrediction { pub struct Gene { pub start: usize, - pub metastart: usize, pub end: usize, pub frame: usize, pub score: f64, @@ -89,7 +88,7 @@ impl Gene { buf.append( &mut format!( "{}\t{}\t{}\t{}\t{:.6}\tI:{}\tD:{}\n", - self.metastart, + self.start, self.end, if self.forward_strand { '+' } else { '-' }, self.frame, @@ -112,12 +111,12 @@ impl Gene { &mut format!( "{}\tFGS\tCDS\t{}\t{}\t.\t{}\t{}\tID={}_{}_{}_{};product=predicted protein\n", head, - self.metastart, + self.start, self.end, if self.forward_strand { '+' } else { '-' }, self.frame - 1, head, - self.metastart, + self.start, self.end, if self.forward_strand { '+' } else { '-' } ) diff --git a/src/hmm.rs b/src/hmm.rs index d6279b7..3fc433b 100644 --- a/src/hmm.rs +++ b/src/hmm.rs @@ -149,8 +149,8 @@ pub fn get_train_from_file( read_noncoding(&mut locals, train_dir.join("noncoding"))?; read_start(&mut locals, train_dir.join("start"))?; read_stop(&mut locals, train_dir.join("stop"))?; - read_start1(&mut locals, train_dir.join("start1"))?; - read_stop1(&mut locals, train_dir.join("stop1"))?; + read_start1(&mut locals, train_dir.join("stop1"))?; // keep FGS naming scheme + read_stop1(&mut locals, train_dir.join("start1"))?; // keep FGS naming scheme read_pwm(&mut locals, train_dir.join("pwm"))?; Ok((global, locals)) diff --git a/src/viterbi.rs b/src/viterbi.rs index cf6e5a6..43608a4 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -33,7 +33,7 @@ pub fn forward( for _ in 0..seq.len() { alpha.push([0.0; hmm::State::COUNT]); - path.push([None; hmm::State::COUNT]); + path.push([Some(hmm::State::S); hmm::State::COUNT]); } alpha[0].copy_from_slice(&global.pi); for i in &mut alpha[0] { @@ -356,7 +356,7 @@ pub fn forward( local.tr_e[i + 60 - t][trinucleotide(seq.get(i..).unwrap()).unwrap_or(0)]; } if t < 60 { - start_freq *= 58.0 / (t - 3 + 1) as f64 + start_freq *= 58.0 / (t.saturating_sub(3) + 1) as f64 } modify_border_dist(&mut alpha[t + 2][hmm::State::E], &local.dist_e, start_freq); } @@ -432,7 +432,7 @@ pub fn forward( if t < seq.len() - 2 && (seq[t] == A || seq[t] == G || seq[t] == T) - && seq[t + 1] == A + && seq[t + 1] == T && seq[t + 2] == G { alpha[t][hmm::State::S] = f64::INFINITY; @@ -616,9 +616,10 @@ fn build_genes( dna.push(seq[t]); dna_start_t_withstop = t + 1; dna_start_t = t + 1; - if vpath[t] == hmm::State::M1 || vpath[t] == hmm::State::M4r { + if vpath[t] == hmm::State::M1r || vpath[t] == hmm::State::M4r { if t > 2 { dna_start_t_withstop = t - 2; + dna.splice(0..0, seq[t - 3..t].iter().cloned()); } } @@ -657,20 +658,21 @@ fn build_genes( if codon_start == 1 { if start_t == dna_start_t as isize - 3 { dna_start_t -= 3; + dna.splice(0..0, seq[dna_start_t - 1..dna_start_t + 2].iter().cloned()); } if whole_genome { // add refinement of the start codons here let start_old = start_t as usize; - let mut codon = &seq[start_old..start_old + 3]; + let mut codon = &seq[start_old - 1..start_old + 2]; let mut s = 0; // find the optimal start codon within 30bp up- and downstream of start codon let mut e_save = 0.0; let mut s_save = 0; - while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) + while !(codon == [T, A, A] || codon == [T, A, G] || codon == [T, G, A]) && start_old >= 1 + s + 35 { - if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { + if codon == [A, T, G] || codon == [G, T, G] || codon == [T, T, G] { let utr = &seq[start_old - 1 - s - 30..start_old - 1 - s - 30 + 63]; let mut freq_sum = 0.0; for j in 0..utr.len() - 2 { @@ -682,19 +684,28 @@ fn build_genes( s_save = 0; } else if freq_sum < e_save { e_save = freq_sum; - s_save = -(s as isize); // posivite chain, upstream s_save = -1 * 3 + s_save = s; // posivite chain, upstream s_save = -1 * 3 } } s += 3; codon = &seq[start_old - 1 - s..start_old - 1 - s + 3]; - - dna_start_t += s_save as usize; } + dna_start_t -= s_save; + dna.splice( + 0..0, + seq[dna_start_t - 1..dna_start_t + s_save - 1] + .iter() + .cloned(), + ); } + // add final codon + dna.push(seq[end_t - 3]); + dna.push(seq[end_t - 2]); + dna.push(seq[end_t - 1]); + read_prediction.genes.push(gene::Gene { start: dna_start_t, - metastart: dna_start_t, end: end_t, frame: frame, score: final_score, @@ -713,10 +724,10 @@ fn build_genes( // find the optimal start codon within 30bp up- and downstream of start codon let mut e_save = 0.0; let mut s_save = 0; - while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) + while !(codon == [T, T, A] || codon == [C, T, A] || codon == [T, C, A]) && end_old - 2 + s + 35 < seq.len() { - if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { + if codon == [C, A, T] || codon == [C, A, C] || codon == [C, A, A] { let utr = &seq[end_old - 1 - 2 + s - 30..end_old + s + 30]; let mut freq_sum = 0.0; for j in 0..utr.len() - 2 { @@ -730,15 +741,19 @@ fn build_genes( } } s += 3; - codon = &seq[end_old - 1 - 2 + s..end_old]; + codon = &seq[end_old - 1 - 2 + s..end_old + s]; } end_t = end_old + s_save; } + // add final codon + dna.push(seq[end_t - 3]); + dna.push(seq[end_t - 2]); + dna.push(seq[end_t - 1]); + read_prediction.genes.push(gene::Gene { start: dna_start_t_withstop, - metastart: dna_start_t, end: end_t, frame: frame, score: final_score, @@ -841,7 +856,7 @@ fn from_s_to_m( to: usize, ) { let temp_alpha = alpha[t - 1][hmm::State::S] - local.e_m[0][from2][to]; - if temp_alpha < alpha[t][hmm::State::M1] { + if temp_alpha <= alpha[t][hmm::State::M1] { alpha[t][hmm::State::M1] = temp_alpha; path[t][hmm::State::M1] = Some(hmm::State::S); }