Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extend genes in complete genomes to start/stop codons #5

Merged
merged 16 commits into from
Oct 6, 2021
Merged
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
/target
FragGeneScan
FGS+
prodigal
*.ffn
*.faa
*.gff
*.csv
2 changes: 2 additions & 0 deletions meta/evaluation/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.fasta
*.txt
67 changes: 67 additions & 0 deletions meta/evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes

Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes

The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly.

The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes.

## Create the annotations and lengths files

Execute `annotations.py`.

## Create the FGS/FGS+ files (from .aa)

(swap directories to execute these)

```sh
cd path/to/FGS
./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1
./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1
cd -
rm FGS.out FGS.ffn
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv
```

## Create the FGSrs/Prodigal files (from .gff)

```sh
FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1
prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff
grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv
grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv
```

## Print comparison table

Execute `rates.py`.

## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine)

Run in the FGS or FGS+ directory (for the training files).

```sh
hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \
'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \
'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \
'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff'
```

```
Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1
Time (mean ± σ): 3.797 s ± 0.006 s [User: 3.413 s, System: 0.348 s]
Range (min … max): 3.792 s … 3.807 s 5 runs

Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1
Time (mean ± σ): 369.979 s ± 25.774 s [User: 367.679 s, System: 0.517 s]
Range (min … max): 353.713 s … 415.649 s 5 runs

Benchmark #1: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1
Time (mean ± σ): 1.703 s ± 0.014 s [User: 1.395 s, System: 0.275 s]
Range (min … max): 1.684 s … 1.719 s 5 runs

Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff
Time (mean ± σ): 8.533 s ± 0.038 s [User: 8.453 s, System: 0.047 s]
Range (min … max): 8.493 s … 8.573 s 5 runs
```
18 changes: 18 additions & 0 deletions meta/evaluation/annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
with open('ena_data_20210917-1328.txt') as f, open('annotations.csv', 'w') as a, open('readlengths.csv', 'w') as l:
for line in f:
if line.startswith('AC'):
accession = line[2:].strip()[:-1]
elif line.startswith('FT gene'):
span = line[9:].strip()
if span.startswith('complement('):
span = span[11:-1]
strand = '-'
else:
strand = '+'
start, end = span.split('..')
print(accession, start, end, strand, sep=',', file=a)
elif line.startswith('SQ'):
parts = line[2:].strip().split(' ')
assert parts[0] == "Sequence"
print(accession, parts[1], sep=',', file=l)
assert parts[2].startswith("BP")
75 changes: 75 additions & 0 deletions meta/evaluation/rates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python
from collections import namedtuple
from operator import itemgetter

Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand'])
Event = namedtuple('Event', ['location', 'annotation', 'prediction'])

def parse_readlengths(filename):
with open(filename) as f:
for line in f:
name, length = line.strip().split(',')
yield name, int(length)

def parse_csv(filename):
with open(filename) as f:
for line in f:
name, start, end, strand = line.strip().split(',')
yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1)

def events(read_lengths, annotations, predictions):
names = dict()
total = 0
for name, length in parse_readlengths(read_lengths):
names[name] = total
total += length

for name, start, end, strand in parse_csv(annotations):
yield Event(names[name] + start, strand, None)
yield Event(names[name] + end, 0, None)

for name, start, end, strand in parse_csv(predictions):
yield Event(names[name] + start, None, strand)
yield Event(names[name] + end, None, 0)

def rates(read_lengths, annotations, predictions):
rates = dict(tp=0, fp=0, tn=0, fn=0)
cl = 0 # current location
ca = 0 # current annotation
cp = 0 # current prediction
for l, a, p in sorted(events(read_lengths, annotations, predictions), key=itemgetter(0)):
if ca == 0 and cp == 0:
rates['tn'] += l - cl
elif ca == cp:
rates['tp'] += l - cl
elif ca == 0 and cp != 0:
rates['fp'] += l - cl
elif ca != 0 and cp == 0:
rates['fn'] += l - cl
else: # different strands
rates['fp'] += l - cl

if a is not None: ca = a
if p is not None: cp = p
cl = l
return rates

body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}'
head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}'
print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC'))
for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']:
r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv')
tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn']
t = tp + fp + tn + fn
print(body.format(
tool,
tp / t,
fp / t,
tn / t,
fn / t,
tp / (tp + fp),
tp / (tp + fn),
tn / (tn + fp),
tn / (tn + fn),
(tp * tn - fp * fn) / ((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))**0.5
))
11 changes: 11 additions & 0 deletions meta/out-to-gff.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
BEGIN { print "##gff-version 3"; }
{
s = substr($1, 1, 1)
if (s == ">") {
seqid = substr($1, 2)
} else {
s = split($0, t, "\t")
id = "ID=" seqid "_" t[1] "_" t[2] "_" t[3] ";product=predicted protein"
print seqid "\tFGS\tCDS\t" t[1] "\t" t[2] "\t.\t" t[3] "\t" int(t[4] - 1) "\t" id
}
}
19 changes: 12 additions & 7 deletions src/bin/FragGeneScanRs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use rayon::ThreadPoolBuilder;

extern crate frag_gene_scan_rs;
use frag_gene_scan_rs::dna::{count_cg_content, Nuc};
use frag_gene_scan_rs::gene;
use frag_gene_scan_rs::hmm;
use frag_gene_scan_rs::viterbi::viterbi;

Expand Down Expand Up @@ -236,13 +237,17 @@ fn run<R: Read + Send, W: WritingBuffer + Send>(
let fasta::OwnedRecord { mut head, seq } = record?;
head = head.into_iter().take_while(u8::is_ascii_graphic).collect();
let nseq: Vec<Nuc> = seq.into_iter().map(Nuc::from).collect();
let read_prediction = viterbi(
&global,
&locals[count_cg_content(&nseq)],
head,
nseq,
whole_genome,
);
let read_prediction = if nseq.is_empty() {
gene::ReadPrediction::new(head)
} else {
viterbi(
&global,
&locals[count_cg_content(&nseq)],
head,
nseq,
whole_genome,
)
};
if meta_buffer.is_some() {
read_prediction.meta(&mut metabuf)?;
}
Expand Down
7 changes: 3 additions & 4 deletions src/gene.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ impl ReadPrediction {

pub struct Gene {
pub start: usize,
pub metastart: usize,
pub end: usize,
pub frame: usize,
pub score: f64,
Expand All @@ -89,7 +88,7 @@ impl Gene {
buf.append(
&mut format!(
"{}\t{}\t{}\t{}\t{:.6}\tI:{}\tD:{}\n",
self.metastart,
self.start,
self.end,
if self.forward_strand { '+' } else { '-' },
self.frame,
Expand All @@ -112,12 +111,12 @@ impl Gene {
&mut format!(
"{}\tFGS\tCDS\t{}\t{}\t.\t{}\t{}\tID={}_{}_{}_{};product=predicted protein\n",
head,
self.metastart,
self.start,
self.end,
if self.forward_strand { '+' } else { '-' },
self.frame - 1,
head,
self.metastart,
self.start,
self.end,
if self.forward_strand { '+' } else { '-' }
)
Expand Down
4 changes: 2 additions & 2 deletions src/hmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ pub fn get_train_from_file(
read_noncoding(&mut locals, train_dir.join("noncoding"))?;
read_start(&mut locals, train_dir.join("start"))?;
read_stop(&mut locals, train_dir.join("stop"))?;
read_start1(&mut locals, train_dir.join("start1"))?;
read_stop1(&mut locals, train_dir.join("stop1"))?;
read_start1(&mut locals, train_dir.join("stop1"))?; // keep FGS naming scheme
read_stop1(&mut locals, train_dir.join("start1"))?; // keep FGS naming scheme
read_pwm(&mut locals, train_dir.join("pwm"))?;

Ok((global, locals))
Expand Down
Loading