-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix detection of start/stop codons for 'complete' input
* fix/complete-start: run prodigal without meta option allow empty sequences include start and stop codons in output add last nights benchmarks to the evaluation README add quality measuring scripts compare to negative strand for dna_start_t_withstop invert s_save to avoid negative numbers negate strcmp dropme: start with start states infinity is sometimes smaller in FGS unfix start1/stop1 filenames include reverse stop codon in meta output fix ACGT typo to correct complete predictions revert "extend genes in complete genomes to start/stop codons" use corrected dna_start_t extend genes in complete genomes to start/stop codons
- Loading branch information
Showing
10 changed files
with
228 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,8 @@ | ||
/target | ||
FragGeneScan | ||
FGS+ | ||
prodigal | ||
*.ffn | ||
*.faa | ||
*.gff | ||
*.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
*.fasta | ||
*.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Evaluation of FGS, FGSrs, FGS+, Prodigal on whole genomes | ||
|
||
Source assembly: https://www.ebi.ac.uk/ena/browser/view/GCA_001628815?show=chromosomes | ||
|
||
The 'FASTA' download `ena_data_20210917-1328.fasta` is the complete assembly. | ||
|
||
The 'TEXT' download `ena_data_20210917-1328.txt` also contains annotated genes. | ||
|
||
## Create the annotations and lengths files | ||
|
||
Execute `annotations.py`. | ||
|
||
## Create the FGS/FGS+ files (from .faa) | ||
|
||
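(FragGeneScan and FGS+ need their training files, so change into the tool's own directory before running it; `~-` is the shell's shorthand for the previous working directory, i.e. this evaluation directory) | ||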
|
||
```sh | ||
cd path/to/FGS | ||
./FragGeneScan -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS -t complete -w 1 | ||
./FGS+ -s ~-/ena_data_20210917-1328.fasta -o ~-/FGS+ -t complete -w 1 | ||
cd - | ||
rm FGS.out FGS.ffn | ||
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS.faa > FGS.csv | ||
sed -n 's/^>ENA|\([^|]*\)|.*_\([0-9]*\)_\([0-9]*\)_\([+-]\)$/\1,\2,\3,\4/p' FGS+.faa > FGS+.csv | ||
``` | ||
|
||
## Create the FGSrs/Prodigal files (from .gff) | ||
|
||
```sh | ||
FragGeneScanRs -s ena_data_20210917-1328.fasta -g FGSrs.gff -t complete -w 1 | ||
prodigal -i ena_data_20210917-1328.fasta -f gff -o prodigal.gff | ||
grep -v '^#' FGSrs.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > FGSrs.csv | ||
grep -v '^#' prodigal.gff | tr '\t' ',' | cut -d, -f1,4,5,7 | sed 's/ENA|//;s/|[^,]*,/,/' > prodigal.csv | ||
``` | ||
|
||
## Print comparison table | ||
|
||
Execute `rates.py`. | ||
|
||
## Timings for these predictions using [hyperfine](https://github.com/sharkdp/hyperfine) | ||
|
||
Run in the FGS or FGS+ directory (for the training files). | ||
|
||
```sh | ||
hyperfine 'FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1' \ | ||
'FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1' \ | ||
'FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1' \ | ||
'prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff' | ||
``` | ||
|
||
``` | ||
Benchmark #1: ./FragGeneScan -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS -t complete -w 1 | ||
Time (mean ± σ): 3.797 s ± 0.006 s [User: 3.413 s, System: 0.348 s] | ||
Range (min … max): 3.792 s … 3.807 s 5 runs | ||
Benchmark #2: ./FGS+ -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGS+ -t complete -w 1 | ||
Time (mean ± σ): 369.979 s ± 25.774 s [User: 367.679 s, System: 0.517 s] | ||
Range (min … max): 353.713 s … 415.649 s 5 runs | ||
Benchmark #3: FragGeneScanRs -s meta/evaluation/ena_data_20210917-1328.fasta -o meta/evaluation/FGSrs -t complete -w 1 | ||
Time (mean ± σ): 1.703 s ± 0.014 s [User: 1.395 s, System: 0.275 s] | ||
Range (min … max): 1.684 s … 1.719 s 5 runs | ||
Benchmark #4: prodigal -i meta/evaluation/ena_data_20210917-1328.fasta -f gff -o meta/evaluation/prodigal.gff | ||
Time (mean ± σ): 8.533 s ± 0.038 s [User: 8.453 s, System: 0.047 s] | ||
Range (min … max): 8.493 s … 8.573 s 5 runs | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Extract gene annotations and sequence lengths from the ENA flat-file
# ('TEXT') download.
#
# Output files:
#   annotations.csv  accession,start,end,strand  (one row per gene feature)
#   readlengths.csv  accession,length            (one row per SQ header)

# Feature-table lines consist of the line code "FT", three spaces and the
# feature key; the location follows the key. The prefix must keep those
# three spaces, otherwise no gene line is ever matched.
GENE_PREFIX = 'FT   gene'
COMPLEMENT = 'complement('

with open('ena_data_20210917-1328.txt') as f, \
        open('annotations.csv', 'w') as a, \
        open('readlengths.csv', 'w') as l:
    for line in f:
        if line.startswith('AC'):
            # Drop the two-letter line code and the trailing ';'.
            accession = line[2:].strip()[:-1]
        elif line.startswith(GENE_PREFIX):
            span = line[len(GENE_PREFIX):].strip()
            if span.startswith(COMPLEMENT):
                # complement(a..b) marks a gene on the reverse strand.
                span = span[len(COMPLEMENT):-1]
                strand = '-'
            else:
                strand = '+'
            start, end = span.split('..')
            print(accession, start, end, strand, sep=',', file=a)
        elif line.startswith('SQ'):
            parts = line[2:].strip().split(' ')
            # Validate the whole header before writing a row for it, and
            # do so with a real exception so the check survives `python -O`.
            if (len(parts) < 3 or parts[0] != 'Sequence'
                    or not parts[2].startswith('BP')):
                raise ValueError(f'unexpected SQ line: {line.rstrip()!r}')
            print(accession, parts[1], sep=',', file=l)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#!/usr/bin/env python | ||
from collections import namedtuple | ||
from operator import itemgetter | ||
|
||
# One gene, annotated or predicted: sequence name, integer start and end
# coordinates, and strand (parse_csv stores 1 for '+', -1 for anything else).
Annotation = namedtuple('Annotation', ['name', 'start', 'end', 'strand'])
# One state change at `location`. `annotation` and `prediction` hold the new
# strand state for their track (0 when a gene ends), or None when that track
# is not affected by this event.
Event = namedtuple('Event', ['location', 'annotation', 'prediction'])
|
||
def parse_readlengths(filename):
    """Yield ``(name, length)`` pairs from a two-column CSV file.

    Each non-blank line must look like ``name,length``; ``length`` is
    converted to ``int``. Blank lines (for example a trailing empty line)
    are skipped instead of aborting the whole run.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields
            or the length is not an integer.
    """
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, length = line.split(',')
            yield name, int(length)
|
||
def parse_csv(filename):
    """Yield an ``Annotation`` for every gene line in a CSV file.

    Each non-blank line must look like ``name,start,end,strand``. ``start``
    and ``end`` are converted to ``int``; the strand becomes ``1`` for
    ``'+'`` and ``-1`` for any other value. Blank lines are skipped so an
    empty or newline-terminated file does not abort the run.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields
            or a coordinate is not an integer.
    """
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, start, end, strand = line.split(',')
            yield Annotation(name, int(start), int(end), 1 if strand == '+' else -1)
|
||
def events(read_lengths, annotations, predictions):
    """Yield the state-change events of all annotated and predicted genes.

    All three arguments are file names. Sequences are laid end to end in
    the order of the read lengths file, so every gene coordinate is shifted
    by the summed length of the sequences listed before its own.

    For each gene two events are produced: one at its start carrying the
    strand, and one at its end carrying 0. Annotation events leave the
    prediction slot as None and vice versa. Events are not sorted.

    Raises:
        KeyError: if a gene names a sequence missing from the lengths file.
    """
    offsets = {}
    running = 0
    for sequence, size in parse_readlengths(read_lengths):
        offsets[sequence] = running
        running += size

    # NOTE(review): the closing event sits at `end` itself, so a gene is
    # counted as end - start positions; if the inputs use inclusive end
    # coordinates this is one position short - confirm.
    for gene in parse_csv(annotations):
        base = offsets[gene.name]
        yield Event(base + gene.start, gene.strand, None)
        yield Event(base + gene.end, 0, None)

    for gene in parse_csv(predictions):
        base = offsets[gene.name]
        yield Event(base + gene.start, None, gene.strand)
        yield Event(base + gene.end, None, 0)
|
||
def rates(read_lengths, annotations, predictions):
    """Count positions per confusion-matrix class for one prediction file.

    The events of both tracks are walked in order of location. The stretch
    between two consecutive events is credited to exactly one class, based
    on the annotation and prediction states in force over that stretch:

    * both 0                      -> 'tn'
    * equal and non-zero          -> 'tp'
    * prediction 0, annotation not -> 'fn'
    * anything else (prediction without annotation, or opposite strands)
                                  -> 'fp'

    Returns a dict with the keys 'tp', 'fp', 'tn' and 'fn'.
    """
    counts = dict(tp=0, fp=0, tn=0, fn=0)
    position = 0   # location of the previous event
    annotated = 0  # annotation state since the previous event
    predicted = 0  # prediction state since the previous event

    timeline = sorted(events(read_lengths, annotations, predictions), key=itemgetter(0))
    for location, annotation, prediction in timeline:
        if annotated == predicted:
            bucket = 'tn' if annotated == 0 else 'tp'
        elif predicted == 0:
            bucket = 'fn'
        else:
            bucket = 'fp'
        counts[bucket] += location - position

        # None means "this track did not change at this event".
        if annotation is not None:
            annotated = annotation
        if prediction is not None:
            predicted = prediction
        position = location
    return counts
|
||
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Print one row per tool: the four classes as a share of all counted
# positions, followed by the derived quality measures. Header labels are
# cut to four characters by the '.4s' precision so they fit the columns.
body = '{:<10}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}{:>8.2%}'
head = '{:<10}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}{:>8.4s}'
print(head.format('tool', 'TP', 'FP', 'TN', 'FN', 'precision', 'sensitivity', 'specificity', 'NPV', 'MCC'))
for tool in ['FGS', 'FGS+', 'prodigal', 'FGSrs']:
    r = rates('readlengths.csv', 'annotations.csv', f'{tool}.csv')
    tp, fp, tn, fn = r['tp'], r['fp'], r['tn'], r['fn']
    t = tp + fp + tn + fn
    # _ratio keeps the table printable (as 'nan%') when a tool predicts
    # nothing or a class is empty, instead of dying on a division by zero.
    print(body.format(
        tool,
        _ratio(tp, t),
        _ratio(fp, t),
        _ratio(tn, t),
        _ratio(fn, t),
        _ratio(tp, tp + fp),
        _ratio(tp, tp + fn),
        _ratio(tn, tn + fp),
        _ratio(tn, tn + fn),
        _ratio(tp * tn - fp * fn, ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5),
    ))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Convert prediction records into GFF3.
#
# Lines whose first field starts with ">" name the current sequence; every
# other line is split on tabs and written as one CDS feature with source
# "FGS". Tab columns 1, 2 and 3 go to the GFF start, end and strand columns,
# and column 4 minus one goes to the phase column.
BEGIN { print "##gff-version 3" }

# Header line: remember the sequence id without its leading ">".
$1 ~ /^>/ {
    seqid = substr($1, 2)
    next
}

# Any other line: emit one feature for the current sequence.
{
    split($0, col, "\t")
    attrs = "ID=" seqid "_" col[1] "_" col[2] "_" col[3] ";product=predicted protein"
    printf "%s\tFGS\tCDS\t%s\t%s\t.\t%s\t%d\t%s\n", seqid, col[1], col[2], col[3], int(col[4] - 1), attrs
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.