From 2a6b0cff9fcbf95fe76efcdaa6c1114c04ab128d Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Thu, 16 Sep 2021 22:36:01 +0200 Subject: [PATCH 01/16] extend genes in complete genomes to start/stop codons tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 1917354 383036 508737 856214 83.35 69.13 57.05 37.27 23.23 --- .gitignore | 2 + src/viterbi.rs | 229 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 160 insertions(+), 71 deletions(-) diff --git a/.gitignore b/.gitignore index ea8c4bf..1948847 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ /target +FragGeneScan +FGS+ diff --git a/src/viterbi.rs b/src/viterbi.rs index cf6e5a6..3165b1e 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -1,3 +1,5 @@ +use std::cmp::Ordering::{Equal, Greater, Less}; + use strum::EnumCount; use strum::IntoEnumIterator; @@ -14,7 +16,7 @@ pub fn viterbi( ) -> gene::ReadPrediction { let (alpha, path) = forward(global, local, &seq, whole_genome); let vpath = backtrack(&alpha, path); - build_genes(&local, head, seq, whole_genome, vpath, alpha) + build_genes(head, seq, whole_genome, vpath, alpha) } pub fn forward( @@ -356,7 +358,7 @@ pub fn forward( local.tr_e[i + 60 - t][trinucleotide(seq.get(i..).unwrap()).unwrap_or(0)]; } if t < 60 { - start_freq *= 58.0 / (t - 3 + 1) as f64 + start_freq *= 58.0 / t.saturating_sub(2) as f64 } modify_border_dist(&mut alpha[t + 2][hmm::State::E], &local.dist_e, start_freq); } @@ -567,7 +569,6 @@ fn backtrack( } fn build_genes( - local: &hmm::Local, head: Vec, seq: Vec, whole_genome: bool, @@ -645,10 +646,37 @@ fn build_genes( end_t = temp_t; } + if whole_genome { + // gene must have a start and stop codon; pick the best score nearby + if codon_start == 1 { + extend( + &mut dna, + &mut dna_start_t, + &mut end_t, + true, + [[A, T, G], [G, T, G], [T, T, G]], + [[T, A, A], [T, A, G], [T, G, A]], + &seq, + &alpha, + &vpath, + ); + } else if codon_start == -1 { + extend( + &mut dna, + &mut dna_start_t, + &mut end_t, + false, + [[T, T, A], [C, T, A], [T, C, A]], + [[C, A, T], [C, A, C], [C, A, A]], + &seq, + &alpha, + &vpath, + ); + } + } + if dna.len() > gene_len { - let final_score = (alpha[end_t - 4][vpath[end_t - 4]] - - alpha[start_t as usize + 2][vpath[start_t as usize + 2]]) - / (end_t - start_t as usize - 5) as f64; + let final_score = score(start_t as usize, end_t, &alpha, &vpath); let mut frame = start_orf % 3; if frame == 0 { frame = 3 @@ -659,39 +687,6 @@ fn build_genes( dna_start_t -= 3; } - if whole_genome { - // add refinement of the start codons here - let start_old = start_t as usize; - let mut codon = &seq[start_old..start_old + 3]; - let mut s = 0; - // find the optimal start codon within 30bp up- and downstream of start codon - let mut e_save = 0.0; - let mut s_save = 0; - while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) - && start_old >= 1 + s + 35 - { - if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { - let utr = &seq[start_old - 1 - s - 30..start_old - 1 - s - 30 + 63]; - let mut freq_sum = 0.0; - for j in 0..utr.len() - 2 { - freq_sum -= local.tr_s[j] - [trinucleotide(utr.get(j..).unwrap()).unwrap_or(0)]; - } - if s == 0 { - e_save = freq_sum; - s_save = 0; - } else if freq_sum < e_save { - e_save = freq_sum; - s_save = -(s as isize); // posivite chain, upstream s_save = -1 * 3 - } - } - s += 3; - codon = &seq[start_old - 1 - s..start_old - 1 - s + 3]; - - dna_start_t += s_save as usize; - } - } - read_prediction.genes.push(gene::Gene { start: dna_start_t, metastart: dna_start_t, @@ -704,38 +699,6 @@ fn build_genes( deleted: delete.clone(), }); } else if codon_start == -1 { - if whole_genome { - // add refinement of the start codons here - // l 946 - let end_old = end_t; - let mut codon = &seq[end_t - 1 - 2..end_t]; - let mut s = 0; - // find the optimal start codon within 30bp up- and downstream of start codon - let mut e_save = 0.0; - let mut s_save = 0; - while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) - && end_old - 2 + s + 35 < seq.len() - { - if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { - let utr = &seq[end_old - 1 - 2 + s - 30..end_old + s + 30]; - let mut freq_sum = 0.0; - for j in 0..utr.len() - 2 { - // TODO stop1? (their note) - freq_sum -= local.tr_e1[j] - [trinucleotide(utr.get(j..).unwrap()).unwrap_or(0)]; - } - if s == 0 || freq_sum < e_save { - e_save = freq_sum; - s_save = s; // negative chain, s_save = s - } - } - s += 3; - codon = &seq[end_old - 1 - 2 + s..end_old]; - } - - end_t = end_old + s_save; - } - read_prediction.genes.push(gene::Gene { start: dna_start_t_withstop, metastart: dna_start_t, @@ -1004,3 +967,127 @@ fn modify_border_dist(cell: &mut f64, values: &[f64], start_freq: f64) { values[5] * (-1.0 * (start_freq - values[4]).powi(2) / (values[3]).powi(2) / 2.0).exp(); *cell -= (h_kd / (h_kd + r_kd)).max(0.01).min(0.99).ln(); } + +fn extend( + dna: &mut Vec, + left: &mut usize, + right: &mut usize, + forward: bool, + startcodons: [[Nuc; 3]; 3], + stopcodons: [[Nuc; 3]; 3], + seq: &[Nuc], + alpha: &[[f64; 29]], + vpath: &[hmm::State], +) { + if forward { + let mut c = (*left + 6).max(right.saturating_sub(30)); // stop candidate 1-based + while c < (*right + 198).min(seq.len() - 1) + && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) + { + c += 3; + } + if c < (*right + 198).min(seq.len() - 1) { + c += 3; // include stop codon + match c.cmp(right) { + Less => { + dna.truncate(dna.len() + c - *right); + } + Equal => {} + Greater => { + dna.extend(&seq[*right - 1..c - 1]); + } + } + *right = c; + } + + let mut starts = Vec::new(); + let mut c = right.saturating_sub(6).min(*left + 30); // start candidate 1-based + while c >= 3 + && c > left.saturating_sub(198) + && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) + { + if startcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) { + starts.push(c); + } + c -= 3; + } + + let mut startc = *left; + let mut maxscore = score(*left, *right, alpha, vpath); + for s in starts.into_iter() { + let nscore = score(s, *right, alpha, vpath); + if f64::INFINITY > nscore && nscore > maxscore { + startc = s; + maxscore = nscore; + } + } + match startc.cmp(left) { + Less => { + dna.splice(0..0, seq[startc - 1..*left - 1].iter().cloned()); + } + Equal => {} + Greater => { + dna.drain(0..startc - *left); + } + } + *left = startc; + } else { + let mut c = right.saturating_sub(6).min(*left + 30); // stop candidate 1-based + while c >= 3 + && c > left.saturating_sub(198) + && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) + { + c -= 3; + } + if c >= 3 && c > left.saturating_sub(198) { + c -= 3; // include stop codon + match c.cmp(left) { + Less => { + dna.splice(0..0, seq[c - 1..*left - 1].iter().cloned()); + } + Equal => {} + Greater => { + dna.drain(0..c - *left); + } + } + *left = c; + } + + let mut starts = Vec::new(); + let mut c = (*left + 6).max(right.saturating_sub(30)); // start candidate 1-based + while c < (*right + 198).min(seq.len() - 1) + && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) + { + if startcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) { + starts.push(c); + } + c += 3; + } + + let mut startc = *right; + let mut maxscore = score(*left, *right, alpha, vpath); + for s in starts.into_iter() { + let nscore = score(*left, s, alpha, vpath); + if f64::INFINITY > nscore && nscore > maxscore { + startc = s; + maxscore = nscore; + } + } + match startc.cmp(right) { + Less => { + dna.truncate(dna.len() + startc - *right); + } + Equal => {} + Greater => { + dna.extend(&seq[*right - 1..startc - 1]); + } + } + *right = startc; + } +} + +fn score(start: usize, stop: usize, alpha: &[[f64; 29]], vpath: &[hmm::State]) -> f64 { + // start and stop are 1-based (-1), exclude start (+3) and stop (-3) codons + (alpha[stop - 4][vpath[stop - 4]] - alpha[start + 2][vpath[start + 2]]) + / (stop - start - 5) as f64 +} From 5498c802a95a524081fec839dea04e5875f776a5 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Thu, 16 Sep 2021 23:11:08 +0200 Subject: [PATCH 02/16] use corrected dna_start_t tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 1917354 383036 508737 856214 83.35 69.13 57.05 37.27 23.23 --- src/viterbi.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index 3165b1e..9852c51 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -700,7 +700,11 @@ fn build_genes( }); } else if codon_start == -1 { read_prediction.genes.push(gene::Gene { - start: dna_start_t_withstop, + start: if whole_genome { + dna_start_t + } else { + dna_start_t_withstop + }, metastart: dna_start_t, end: end_t, frame: frame, From 2fc2174a12ae545593d5b3f31eca643e3178fe93 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Fri, 17 Sep 2021 11:53:37 +0200 Subject: [PATCH 03/16] revert "extend genes in complete genomes to start/stop codons" tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2601595 527580 367165 169001 83.14 93.9 41.04 68.48 42.47 --- src/viterbi.rs | 235 +++++++++++++++---------------------------------- 1 file changed, 72 insertions(+), 163 deletions(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index 9852c51..cf6e5a6 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -1,5 +1,3 @@ -use std::cmp::Ordering::{Equal, Greater, Less}; - use strum::EnumCount; use strum::IntoEnumIterator; @@ -16,7 +14,7 @@ pub fn viterbi( ) -> gene::ReadPrediction { let (alpha, path) = forward(global, local, &seq, whole_genome); let vpath = backtrack(&alpha, path); - build_genes(head, seq, whole_genome, vpath, alpha) + build_genes(&local, head, seq, whole_genome, vpath, alpha) } pub fn forward( @@ -358,7 +356,7 @@ pub fn forward( local.tr_e[i + 60 - t][trinucleotide(seq.get(i..).unwrap()).unwrap_or(0)]; } if t < 60 { - start_freq *= 58.0 / t.saturating_sub(2) as f64 + start_freq *= 58.0 / (t - 3 + 1) as f64 } modify_border_dist(&mut alpha[t + 2][hmm::State::E], &local.dist_e, start_freq); } @@ -569,6 +567,7 @@ fn backtrack( } fn build_genes( + local: &hmm::Local, head: Vec, seq: Vec, whole_genome: bool, @@ -646,37 +645,10 @@ fn build_genes( end_t = temp_t; } - if whole_genome { - // gene must have a start and stop codon; pick the best score nearby - if codon_start == 1 { - extend( - &mut dna, - &mut dna_start_t, - &mut end_t, - true, - [[A, T, G], [G, T, G], [T, T, G]], - [[T, A, A], [T, A, G], [T, G, A]], - &seq, - &alpha, - &vpath, - ); - } else if codon_start == -1 { - extend( - &mut dna, - &mut dna_start_t, - &mut end_t, - false, - [[T, T, A], [C, T, A], [T, C, A]], - [[C, A, T], [C, A, C], [C, A, A]], - &seq, - &alpha, - &vpath, - ); - } - } - if dna.len() > gene_len { - let final_score = score(start_t as usize, end_t, &alpha, &vpath); + let final_score = (alpha[end_t - 4][vpath[end_t - 4]] + - alpha[start_t as usize + 2][vpath[start_t as usize + 2]]) + / (end_t - start_t as usize - 5) as f64; let mut frame = start_orf % 3; if frame == 0 { frame = 3 @@ -687,6 +659,39 @@ fn build_genes( dna_start_t -= 3; } + if whole_genome { + // add refinement of the start codons here + let start_old = start_t as usize; + let mut codon = &seq[start_old..start_old + 3]; + let mut s = 0; + // find the optimal start codon within 30bp up- and downstream of start codon + let mut e_save = 0.0; + let mut s_save = 0; + while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) + && start_old >= 1 + s + 35 + { + if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { + let utr = &seq[start_old - 1 - s - 30..start_old - 1 - s - 30 + 63]; + let mut freq_sum = 0.0; + for j in 0..utr.len() - 2 { + freq_sum -= local.tr_s[j] + [trinucleotide(utr.get(j..).unwrap()).unwrap_or(0)]; + } + if s == 0 { + e_save = freq_sum; + s_save = 0; + } else if freq_sum < e_save { + e_save = freq_sum; + s_save = -(s as isize); // posivite chain, upstream s_save = -1 * 3 + } + } + s += 3; + codon = &seq[start_old - 1 - s..start_old - 1 - s + 3]; + + dna_start_t += s_save as usize; + } + } + read_prediction.genes.push(gene::Gene { start: dna_start_t, metastart: dna_start_t, @@ -699,12 +704,40 @@ fn build_genes( deleted: delete.clone(), }); } else if codon_start == -1 { + if whole_genome { + // add refinement of the start codons here + // l 946 + let end_old = end_t; + let mut codon = &seq[end_t - 1 - 2..end_t]; + let mut s = 0; + // find the optimal start codon within 30bp up- and downstream of start codon + let mut e_save = 0.0; + let mut s_save = 0; + while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) + && end_old - 2 + s + 35 < seq.len() + { + if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { + let utr = &seq[end_old - 1 - 2 + s - 30..end_old + s + 30]; + let mut freq_sum = 0.0; + for j in 0..utr.len() - 2 { + // TODO stop1? (their note) + freq_sum -= local.tr_e1[j] + [trinucleotide(utr.get(j..).unwrap()).unwrap_or(0)]; + } + if s == 0 || freq_sum < e_save { + e_save = freq_sum; + s_save = s; // negative chain, s_save = s + } + } + s += 3; + codon = &seq[end_old - 1 - 2 + s..end_old]; + } + + end_t = end_old + s_save; + } + read_prediction.genes.push(gene::Gene { - start: if whole_genome { - dna_start_t - } else { - dna_start_t_withstop - }, + start: dna_start_t_withstop, metastart: dna_start_t, end: end_t, frame: frame, @@ -971,127 +1004,3 @@ fn modify_border_dist(cell: &mut f64, values: &[f64], start_freq: f64) { values[5] * (-1.0 * (start_freq - values[4]).powi(2) / (values[3]).powi(2) / 2.0).exp(); *cell -= (h_kd / (h_kd + r_kd)).max(0.01).min(0.99).ln(); } - -fn extend( - dna: &mut Vec, - left: &mut usize, - right: &mut usize, - forward: bool, - startcodons: [[Nuc; 3]; 3], - stopcodons: [[Nuc; 3]; 3], - seq: &[Nuc], - alpha: &[[f64; 29]], - vpath: &[hmm::State], -) { - if forward { - let mut c = (*left + 6).max(right.saturating_sub(30)); // stop candidate 1-based - while c < (*right + 198).min(seq.len() - 1) - && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) - { - c += 3; - } - if c < (*right + 198).min(seq.len() - 1) { - c += 3; // include stop codon - match c.cmp(right) { - Less => { - dna.truncate(dna.len() + c - *right); - } - Equal => {} - Greater => { - dna.extend(&seq[*right - 1..c - 1]); - } - } - *right = c; - } - - let mut starts = Vec::new(); - let mut c = right.saturating_sub(6).min(*left + 30); // start candidate 1-based - while c >= 3 - && c > left.saturating_sub(198) - && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) - { - if startcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) { - starts.push(c); - } - c -= 3; - } - - let mut startc = *left; - let mut maxscore = score(*left, *right, alpha, vpath); - for s in starts.into_iter() { - let nscore = score(s, *right, alpha, vpath); - if f64::INFINITY > nscore && nscore > maxscore { - startc = s; - maxscore = nscore; - } - } - match startc.cmp(left) { - Less => { - dna.splice(0..0, seq[startc - 1..*left - 1].iter().cloned()); - } - Equal => {} - Greater => { - dna.drain(0..startc - *left); - } - } - *left = startc; - } else { - let mut c = right.saturating_sub(6).min(*left + 30); // stop candidate 1-based - while c >= 3 - && c > left.saturating_sub(198) - && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) - { - c -= 3; - } - if c >= 3 && c > left.saturating_sub(198) { - c -= 3; // include stop codon - match c.cmp(left) { - Less => { - dna.splice(0..0, seq[c - 1..*left - 1].iter().cloned()); - } - Equal => {} - Greater => { - dna.drain(0..c - *left); - } - } - *left = c; - } - - let mut starts = Vec::new(); - let mut c = (*left + 6).max(right.saturating_sub(30)); // start candidate 1-based - while c < (*right + 198).min(seq.len() - 1) - && !stopcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) - { - if startcodons.contains(&[seq[c - 1], seq[c], seq[c + 1]]) { - starts.push(c); - } - c += 3; - } - - let mut startc = *right; - let mut maxscore = score(*left, *right, alpha, vpath); - for s in starts.into_iter() { - let nscore = score(*left, s, alpha, vpath); - if f64::INFINITY > nscore && nscore > maxscore { - startc = s; - maxscore = nscore; - } - } - match startc.cmp(right) { - Less => { - dna.truncate(dna.len() + startc - *right); - } - Equal => {} - Greater => { - dna.extend(&seq[*right - 1..startc - 1]); - } - } - *right = startc; - } -} - -fn score(start: usize, stop: usize, alpha: &[[f64; 29]], vpath: &[hmm::State]) -> f64 { - // start and stop are 1-based (-1), exclude start (+3) and stop (-3) codons - (alpha[stop - 4][vpath[stop - 4]] - alpha[start + 2][vpath[start + 2]]) - / (stop - start - 5) as f64 -} From 025a34abcf003f63c7b6ae7afa7a380e91a15e92 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Fri, 17 Sep 2021 11:54:03 +0200 Subject: [PATCH 04/16] fix ACGT typo to correct complete predictions tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2650006 517695 371428 126212 83.66 95.45 41.77 74.64 46.59 --- src/viterbi.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index cf6e5a6..76c4e89 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -356,7 +356,7 @@ pub fn forward( local.tr_e[i + 60 - t][trinucleotide(seq.get(i..).unwrap()).unwrap_or(0)]; } if t < 60 { - start_freq *= 58.0 / (t - 3 + 1) as f64 + start_freq *= 58.0 / (t.saturating_sub(3) + 1) as f64 } modify_border_dist(&mut alpha[t + 2][hmm::State::E], &local.dist_e, start_freq); } @@ -432,7 +432,7 @@ pub fn forward( if t < seq.len() - 2 && (seq[t] == A || seq[t] == G || seq[t] == T) - && seq[t + 1] == A + && seq[t + 1] == T && seq[t + 2] == G { alpha[t][hmm::State::S] = f64::INFINITY; From 137462699b6673329f4759dc441b545d9e483920 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Sun, 19 Sep 2021 23:04:09 +0200 Subject: [PATCH 05/16] include reverse stop codon in meta output tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2652412 517860 371290 123779 83.67 95.54 41.76 75.0 46.78 --- src/gene.rs | 7 +++---- src/viterbi.rs | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/gene.rs b/src/gene.rs index f51710d..ce0dbef 100644 --- a/src/gene.rs +++ b/src/gene.rs @@ -74,7 +74,6 @@ impl ReadPrediction { pub struct Gene { pub start: usize, - pub metastart: usize, pub end: usize, pub frame: usize, pub score: f64, @@ -89,7 +88,7 @@ impl Gene { buf.append( &mut format!( "{}\t{}\t{}\t{}\t{:.6}\tI:{}\tD:{}\n", - self.metastart, + self.start, self.end, if self.forward_strand { '+' } else { '-' }, self.frame, @@ -112,12 +111,12 @@ impl Gene { &mut format!( "{}\tFGS\tCDS\t{}\t{}\t.\t{}\t{}\tID={}_{}_{}_{};product=predicted protein\n", head, - self.metastart, + self.start, self.end, if self.forward_strand { '+' } else { '-' }, self.frame - 1, head, - self.metastart, + self.start, self.end, if self.forward_strand { '+' } else { '-' } ) diff --git a/src/viterbi.rs b/src/viterbi.rs index 76c4e89..3c5c3ae 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -694,7 +694,6 @@ fn build_genes( read_prediction.genes.push(gene::Gene { start: dna_start_t, - metastart: dna_start_t, end: end_t, frame: frame, score: final_score, @@ -738,7 +737,6 @@ fn build_genes( read_prediction.genes.push(gene::Gene { start: dna_start_t_withstop, - metastart: dna_start_t, end: end_t, frame: frame, score: final_score, From 654424ef480ba343384b29b4e5fd1d442ec6c532 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Tue, 21 Sep 2021 11:00:26 +0200 Subject: [PATCH 06/16] unfix start1/stop1 filenames Ratings: tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2661082 518282 368500 117477 83.7 95.77 41.55 75.83 47.14 --- src/hmm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hmm.rs b/src/hmm.rs index d6279b7..3fc433b 100644 --- a/src/hmm.rs +++ b/src/hmm.rs @@ -149,8 +149,8 @@ pub fn get_train_from_file( read_noncoding(&mut locals, train_dir.join("noncoding"))?; read_start(&mut locals, train_dir.join("start"))?; read_stop(&mut locals, train_dir.join("stop"))?; - read_start1(&mut locals, train_dir.join("start1"))?; - read_stop1(&mut locals, train_dir.join("stop1"))?; + read_start1(&mut locals, train_dir.join("stop1"))?; // keep FGS naming scheme + read_stop1(&mut locals, train_dir.join("start1"))?; // keep FGS naming scheme read_pwm(&mut locals, train_dir.join("pwm"))?; Ok((global, locals)) From 0d20f0e51ee4f4d5cdc8ce83032d8889af529169 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Tue, 21 Sep 2021 17:43:06 +0200 Subject: [PATCH 07/16] infinity is sometimes smaller in FGS tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2661082 518282 368500 117477 83.7 95.77 41.55 75.83 47.14 --- src/viterbi.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index 3c5c3ae..97b629a 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -839,7 +839,7 @@ fn from_s_to_m( to: usize, ) { let temp_alpha = alpha[t - 1][hmm::State::S] - local.e_m[0][from2][to]; - if temp_alpha < alpha[t][hmm::State::M1] { + if temp_alpha <= alpha[t][hmm::State::M1] { alpha[t][hmm::State::M1] = temp_alpha; path[t][hmm::State::M1] = Some(hmm::State::S); } From ddde02ba323b820ce23133aaceabba9a6fbcbb06 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Tue, 21 Sep 2021 17:44:51 +0200 Subject: [PATCH 08/16] dropme: start with start states tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2661082 518282 368500 117477 83.7 95.77 41.55 75.83 47.14 --- src/viterbi.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index 97b629a..994d824 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -33,7 +33,7 @@ pub fn forward( for _ in 0..seq.len() { alpha.push([0.0; hmm::State::COUNT]); - path.push([None; hmm::State::COUNT]); + path.push([Some(hmm::State::S); hmm::State::COUNT]); } alpha[0].copy_from_slice(&global.pi); for i in &mut alpha[0] { From 22f2f201197d87404b5845eea4ccfc35a0ca87c8 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Wed, 22 Sep 2021 17:11:33 +0200 Subject: [PATCH 09/16] negate strcmp tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2504714 280484 607613 272530 89.93 90.19 68.42 69.04 58.78 --- src/viterbi.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index 994d824..c1ae16d 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -662,15 +662,15 @@ fn build_genes( if whole_genome { // add refinement of the start codons here let start_old = start_t as usize; - let mut codon = &seq[start_old..start_old + 3]; + let mut codon = &seq[start_old - 1..start_old + 2]; let mut s = 0; // find the optimal start codon within 30bp up- and downstream of start codon let mut e_save = 0.0; let mut s_save = 0; - while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) + while !(codon == [T, A, A] || codon == [T, A, G] || codon == [T, G, A]) && start_old >= 1 + s + 35 { - if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { + if codon == [A, T, G] || codon == [G, T, G] || codon == [T, T, G] { let utr = &seq[start_old - 1 - s - 30..start_old - 1 - s - 30 + 63]; let mut freq_sum = 0.0; for j in 0..utr.len() - 2 { @@ -687,9 +687,8 @@ fn build_genes( } s += 3; codon = &seq[start_old - 1 - s..start_old - 1 - s + 3]; - - dna_start_t += s_save as usize; } + dna_start_t += s_save as usize; } read_prediction.genes.push(gene::Gene { @@ -712,10 +711,10 @@ fn build_genes( // find the optimal start codon within 30bp up- and downstream of start codon let mut e_save = 0.0; let mut s_save = 0; - while !(codon != [T, A, A] || codon != [T, A, G] || codon != [T, G, A]) + while !(codon == [T, T, A] || codon == [C, T, A] || codon == [T, C, A]) && end_old - 2 + s + 35 < seq.len() { - if codon != [A, T, G] || codon != [G, T, G] || codon != [T, T, G] { + if codon == [C, A, T] || codon == [C, A, C] || codon == [C, A, A] { let utr = &seq[end_old - 1 - 2 + s - 30..end_old + s + 30]; let mut freq_sum = 0.0; for j in 0..utr.len() - 2 { @@ -729,7 +728,7 @@ fn build_genes( } } s += 3; - codon = &seq[end_old - 1 - 2 + s..end_old]; + codon = &seq[end_old - 1 - 2 + s..end_old + s]; } end_t = end_old + s_save; From 692c10c1f800c379fbc85cc629cb9e3ab68ee4c8 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Wed, 22 Sep 2021 17:55:20 +0200 Subject: [PATCH 10/16] invert s_save to avoid negative numbers --- src/viterbi.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index c1ae16d..f3c45ee 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -682,13 +682,13 @@ fn build_genes( s_save = 0; } else if freq_sum < e_save { e_save = freq_sum; - s_save = -(s as isize); // posivite chain, upstream s_save = -1 * 3 + s_save = s; // posivite chain, upstream s_save = -1 * 3 } } s += 3; codon = &seq[start_old - 1 - s..start_old - 1 - s + 3]; } - dna_start_t += s_save as usize; + dna_start_t -= s_save; } read_prediction.genes.push(gene::Gene { From 2a3ebcdc2a10e929ace4f176b0c014a8dcfa8e08 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Wed, 22 Sep 2021 17:55:49 +0200 Subject: [PATCH 11/16] compare to negative strand for dna_start_t_withstop tool TP FP TN FN prec sens spec NPV MCC FGS 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 FGS+ 2661256 533556 361341 109188 83.3 96.06 40.38 76.79 46.79 prodigal 2488137 117535 745673 313996 95.49 88.79 86.38 70.37 70.36 FGSrs 2494195 231257 656879 283010 91.51 89.81 73.96 69.89 62.58 --- src/viterbi.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/viterbi.rs b/src/viterbi.rs index f3c45ee..c8cca22 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -616,7 +616,7 @@ fn build_genes( dna.push(seq[t]); dna_start_t_withstop = t + 1; dna_start_t = t + 1; - if vpath[t] == hmm::State::M1 || vpath[t] == hmm::State::M4r { + if vpath[t] == hmm::State::M1r || vpath[t] == hmm::State::M4r { if t > 2 { dna_start_t_withstop = t - 2; } From 694db51ff703b343f528559d7db8aa2fff45363d Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Thu, 23 Sep 2021 00:41:34 +0200 Subject: [PATCH 12/16] add quality measuring scripts --- .gitignore | 5 +++ meta/evaluation/.gitignore | 2 ++ meta/evaluation/README.md | 49 +++++++++++++++++++++++++++ meta/evaluation/annotations.py | 18 ++++++++++ meta/evaluation/rates.py | 62 ++++++++++++++++++++++++++++++++++ meta/out-to-gff.awk | 11 ++++++ 6 files changed, 147 insertions(+) create mode 100644 meta/evaluation/.gitignore create mode 100644 meta/evaluation/README.md create mode 100644 meta/evaluation/annotations.py create mode 100755 meta/evaluation/rates.py create mode 100644 meta/out-to-gff.awk diff --git a/.gitignore b/.gitignore index 1948847..26b21af 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ /target FragGeneScan FGS+ +prodigal +*.ffn +*.faa +*.gff +*.csv diff --git a/meta/evaluation/.gitignore b/meta/evaluation/.gitignore new file mode 100644 index 0000000..89e0fe0 --- /dev/null +++ b/meta/evaluation/.gitignore @@ -0,0 +1,2 @@ +*.fasta +*.txt diff --git a/meta/evaluation/README.md b/meta/evaluation/README.md new file mode 100644 index 0000000..3a8f8c9 --- /dev/null +++ b/meta/evaluation/README.md @@ -0,0 +1,49 @@ +# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes + +Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes + +The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly. + +The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes. + +## Create the annotations and lengths files + +Execute `annotations.py`. + +## Create the FGS/FGS+ files (from .aa) + +(swap directories to execute these) + +```sh +cd path/to/FGS +./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1 +./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1 +cd - +rm FGS.out FGS.ffn +sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv +sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv +``` + +## Create the FGSrs/Prodigal files (from .gff) + +```sh +FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1 +prodigal -i ena_data_20210917-1328.fasta -p meta -f gff -o prodigal.gff +grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv +grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv +``` + +## Print comparison table + +Execute `rates.py`. + +## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine) + +Run in the FGS or FGS+ directory (for the training files). + +```sh +hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \ + 'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \ + 'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \ + 'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff' +``` diff --git a/meta/evaluation/annotations.py b/meta/evaluation/annotations.py new file mode 100644 index 0000000..c76c8d1 --- /dev/null +++ b/meta/evaluation/annotations.py @@ -0,0 +1,18 @@ +with open('ena_data_20210917-1328.txt') as f, open('annotations.csv', 'w') as a, open('readlengths.csv', 'w') as l: + for line in f: + if line.startswith('AC'): + accession = line[2:].strip()[:-1] + elif line.startswith('FT gene'): + span = line[9:].strip() + if span.startswith('complement('): + span = span[11:-1] + strand = '-' + else: + strand = '+' + start, end = span.split('..') + print(accession, start, end, strand, sep=',', file=a) + elif line.startswith('SQ'): + parts = line[2:].strip().split(' ') + assert parts[0] == "Sequence" + print(accession, parts[1], sep=',', file=l) + assert parts[2].startswith("BP") diff --git a/meta/evaluation/rates.py b/meta/evaluation/rates.py new file mode 100755 index 0000000..cd676bd --- /dev/null +++ b/meta/evaluation/rates.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +from collections import namedtuple +from operator import itemgetter + +Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand']) +Event = namedtuple('Event', ['location', 'annotation', 'prediction']) + +def parse_readlengths(filename): + with open(filename) as f: + for line in f: + name, length = line.strip().split(',') + yield name, int(length) + +def parse_csv(filename): + with open(filename) as f: + for line in f: + name, start, end, strand = line.strip().split(',') + yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1) + +def events(read_lengths, annotations, predictions): + names = dict() + total = 0 + for name, length in parse_readlengths(read_lengths): + names[name] = total + total += length + + for name, start, end, strand in parse_csv(annotations): + yield Event(names[name] + start, strand, None) + yield Event(names[name] + end, 0, None) + + for name, start, end, strand in parse_csv(predictions): + yield Event(names[name] + start, None, strand) + yield Event(names[name] + end, None, 0) + +def rates(read_lengths, annotations, predictions): + rates = dict(tp=0, fp=0, tn=0, fn=0) + cl = 0 # current location + ca = 0 # current annotation + cp = 0 # current prediction + for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)): + if ca == 0 and cp == 0: + rates['tn'] += l - cl + elif ca == cp: + rates['tp'] += l - cl + elif ca == 0 and cp != 0: + rates['fp'] += l - cl + elif ca != 0 and cp == 0: + rates['fn'] += l - cl + else: # different strands + rates['fp'] += l - cl + + if a is not None: ca = a + if p is not None: cp = p + cl = l + return rates + +row = '{:<15} {:>10} {:>10} {:>10} {:>10} {:>10.4}{:>10.4}{:>10.4}{:>10.4}{:>10.4}' +print(row.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC')) +for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']: + r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv') + tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn'] + print(row.format(tool, tp, fp, tn, fn, 100 * tp / (tp + fp), 100 * tp / (tp + fn), 100 * tn / (tn + fp), 100 * tn / (tn + fn), 100 * (tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5)) diff --git a/meta/out-to-gff.awk b/meta/out-to-gff.awk new file mode 100644 index 0000000..41b02ef --- /dev/null +++ b/meta/out-to-gff.awk @@ -0,0 +1,11 @@ +BEGIN { print "##gff-version 3"; } +{ + s = substr($1, 1, 1) + if (s == ">") { + seqid = substr($1, 2) + } else { + s = split($0, t, "\t") + id = "ID=" seqid "_" t[1] "_" t[2] "_" t[3] ";product=predicted protein" + print seqid "\tFGS\tCDS\t" t[1] "\t" t[2] "\t.\t" t[3] "\t" int(t[4] - 1) "\t" id + } +} \ No newline at end of file From b5bdd51f02ad922bf5287eaff920ce8af075b770 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Thu, 23 Sep 2021 07:38:59 +0200 Subject: [PATCH 13/16] add last nights benchmarks to the evaluation README --- meta/evaluation/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/meta/evaluation/README.md b/meta/evaluation/README.md index 3a8f8c9..ff601d6 100644 --- a/meta/evaluation/README.md +++ b/meta/evaluation/README.md @@ -47,3 +47,27 @@ hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/ 'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \ 'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff' ``` + +``` +Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1 + Time (mean ± σ): 2.659 s ± 0.004 s [User: 2.403 s, System: 0.250 s] + Range (min … max): 2.653 s … 2.664 s 20 runs + +Benchmark #2: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1 + Time (mean ± σ): 1.078 s ± 0.002 s [User: 882.0 ms, System: 193.8 ms] + Range (min … max): 1.074 s … 1.080 s 20 runs + +Benchmark #3: ./prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff + Time (mean ± σ): 40.209 s ± 0.070 s [User: 40.132 s, System: 0.022 s] + Range (min … max): 40.076 s … 40.334 s 20 runs + +Benchmark #4: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1 + Time (mean ± σ): 191.019 s ± 6.262 s [User: 190.494 s, System: 0.313 s] + Range (min … max): 179.236 s … 200.262 s 20 runs + +Summary + 'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' ran + 2.47 ± 0.01 times faster than './FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' + 37.30 ± 0.09 times faster than './prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff' + 177.21 ± 5.82 times faster than './FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' +``` From 7eac42657f0d6af4000d19105fcea0fd836a42dd Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Thu, 23 Sep 2021 15:13:21 +0200 Subject: [PATCH 14/16] include start and stop codons in output --- src/viterbi.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/viterbi.rs b/src/viterbi.rs index c8cca22..43608a4 100644 --- a/src/viterbi.rs +++ b/src/viterbi.rs @@ -619,6 +619,7 @@ fn build_genes( if vpath[t] == hmm::State::M1r || vpath[t] == hmm::State::M4r { if t > 2 { dna_start_t_withstop = t - 2; + dna.splice(0..0, seq[t - 3..t].iter().cloned()); } } @@ -657,6 +658,7 @@ fn build_genes( if codon_start == 1 { if start_t == dna_start_t as isize - 3 { dna_start_t -= 3; + dna.splice(0..0, seq[dna_start_t - 1..dna_start_t + 2].iter().cloned()); } if whole_genome { @@ -689,8 +691,19 @@ fn build_genes( codon = &seq[start_old - 1 - s..start_old - 1 - s + 3]; } dna_start_t -= s_save; + dna.splice( + 0..0, + seq[dna_start_t - 1..dna_start_t + s_save - 1] + .iter() + .cloned(), + ); } + // add final codon + dna.push(seq[end_t - 3]); + dna.push(seq[end_t - 2]); + dna.push(seq[end_t - 1]); + read_prediction.genes.push(gene::Gene { start: dna_start_t, end: end_t, @@ -734,6 +747,11 @@ fn build_genes( end_t = end_old + s_save; } + // add final codon + dna.push(seq[end_t - 3]); + dna.push(seq[end_t - 2]); + dna.push(seq[end_t - 1]); + read_prediction.genes.push(gene::Gene { start: dna_start_t_withstop, end: end_t, From b2fa61f29eec8515952d81a825ea1a76edc1b89a Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Mon, 27 Sep 2021 10:23:16 +0200 Subject: [PATCH 15/16] allow empty sequences --- src/bin/FragGeneScanRs.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/bin/FragGeneScanRs.rs b/src/bin/FragGeneScanRs.rs index d59bbd1..c3e5709 100644 --- a/src/bin/FragGeneScanRs.rs +++ b/src/bin/FragGeneScanRs.rs @@ -23,6 +23,7 @@ use rayon::ThreadPoolBuilder; extern crate frag_gene_scan_rs; use frag_gene_scan_rs::dna::{count_cg_content, Nuc}; +use frag_gene_scan_rs::gene; use frag_gene_scan_rs::hmm; use frag_gene_scan_rs::viterbi::viterbi; @@ -236,13 +237,17 @@ fn run( let fasta::OwnedRecord { mut head, seq } = record?; head = head.into_iter().take_while(u8::is_ascii_graphic).collect(); let nseq: Vec = seq.into_iter().map(Nuc::from).collect(); - let read_prediction = viterbi( - &global, - &locals[count_cg_content(&nseq)], - head, - nseq, - whole_genome, - ); + let read_prediction = if nseq.is_empty() { + gene::ReadPrediction::new(head) + } else { + viterbi( + &global, + &locals[count_cg_content(&nseq)], + head, + nseq, + whole_genome, + ) + }; if meta_buffer.is_some() { read_prediction.meta(&mut metabuf)?; } From 437ef43c20e124fa014382e6eeb5b7a01c347ed9 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Wed, 29 Sep 2021 14:01:21 +0200 Subject: [PATCH 16/16] run prodigal without meta option --- meta/evaluation/README.md | 38 ++++++++++++++++---------------------- meta/evaluation/rates.py | 19 ++++++++++++++++--- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/meta/evaluation/README.md b/meta/evaluation/README.md index ff601d6..6ec1e10 100644 --- a/meta/evaluation/README.md +++ b/meta/evaluation/README.md @@ -28,7 +28,7 @@ sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+ ```sh FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1 -prodigal -i ena_data_20210917-1328.fasta -p meta -f gff -o prodigal.gff +prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv ``` @@ -45,29 +45,23 @@ Run in the FGS or FGS+ directory (for the training files). hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \ 'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \ 'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \ - 'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff' + 'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff' ``` ``` Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1 - Time (mean ± σ): 2.659 s ± 0.004 s [User: 2.403 s, System: 0.250 s] - Range (min … max): 2.653 s … 2.664 s 20 runs - -Benchmark #2: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1 - Time (mean ± σ): 1.078 s ± 0.002 s [User: 882.0 ms, System: 193.8 ms] - Range (min … max): 1.074 s … 1.080 s 20 runs - -Benchmark #3: ./prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff - Time (mean ± σ): 40.209 s ± 0.070 s [User: 40.132 s, System: 0.022 s] - Range (min … max): 40.076 s … 40.334 s 20 runs - -Benchmark #4: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1 - Time (mean ± σ): 191.019 s ± 6.262 s [User: 190.494 s, System: 0.313 s] - Range (min … max): 179.236 s … 200.262 s 20 runs - -Summary - 'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' ran - 2.47 ± 0.01 times faster than './FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' - 37.30 ± 0.09 times faster than './prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -p meta -f gff -o meta/evaluation/prodigal.gff' - 177.21 ± 5.82 times faster than './FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' + Time (mean ± σ): 3.797 s ± 0.006 s [User: 3.413 s, System: 0.348 s] + Range (min … max): 3.792 s … 3.807 s 5 runs + +Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1 + Time (mean ± σ): 369.979 s ± 25.774 s [User: 367.679 s, System: 0.517 s] + Range (min … max): 353.713 s … 415.649 s 5 runs + +Benchmark #1: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1 + Time (mean ± σ): 1.703 s ± 0.014 s [User: 1.395 s, System: 0.275 s] + Range (min … max): 1.684 s … 1.719 s 5 runs + +Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff + Time (mean ± σ): 8.533 s ± 0.038 s [User: 8.453 s, System: 0.047 s] + Range (min … max): 8.493 s … 8.573 s 5 runs ``` diff --git a/meta/evaluation/rates.py b/meta/evaluation/rates.py index cd676bd..a0ddfb8 100755 --- a/meta/evaluation/rates.py +++ b/meta/evaluation/rates.py @@ -54,9 +54,22 @@ def rates(read_lengths, annotations, predictions): cl = l return rates -row = '{:<15} {:>10} {:>10} {:>10} {:>10} {:>10.4}{:>10.4}{:>10.4}{:>10.4}{:>10.4}' -print(row.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC')) +body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}' +head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}' +print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC')) for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']: r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv') tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn'] - print(row.format(tool, tp, fp, tn, fn, 100 * tp / (tp + fp), 100 * tp / (tp + fn), 100 * tn / (tn + fp), 100 * tn / (tn + fn), 100 * (tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5)) + t = tp + fp + tn + fn + print(body.format( + tool, + tp / t, + fp / t, + tn / t, + fn / t, + tp / (tp + fp), + tp / (tp + fn), + tn / (tn + fp), + tn / (tn + fn), + (tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5 + ))