More scripts useful for concoct binning evaluation
alneberg committed Nov 15, 2018
1 parent 7368f0f commit 69044a2
Showing 6 changed files with 354 additions and 0 deletions.
53 changes: 53 additions & 0 deletions scripts/concoct/approve_bins_from_busco.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python
"""
Based on the busco results, approves bins according to the leves of contamination and completeness.
Copies approved bins to output directory.
@author: alneberg
"""
from __future__ import print_function
import sys
import os
import argparse
import pandas as pd
from shutil import copyfile

def main(args):
    # Read in the busco stats table
    df = pd.read_table(args.busco_stats, index_col=0, sep=',')
    df['completeness_float'] = df['completeness'].apply(lambda x: float(x.replace('%', '')))
    df['duplicated_float'] = df['duplicated'].apply(lambda x: float(x.replace('%', '')))
    # Extract the ids for all rows that meet the requirements
    filtered_df = df[(df['completeness_float'] >= args.min_completeness) & (df['duplicated_float'] <= args.max_contamination)]

    if len(filtered_df) < 1:
        sys.stderr.write("\nApproved 0 bins\n\n")
        return None

    approved_bins = list(filtered_df.index)

    # Copy the approved bins to the output directory
    for approved_bin_int in approved_bins:
        approved_bin = str(approved_bin_int)
        bin_source = os.path.join(args.bin_directory, approved_bin + '.' + args.extension)
        bin_destination = os.path.join(args.output_directory, os.path.basename(bin_source))

        sys.stderr.write("Copying approved bin {} from {} to {}\n".format(approved_bin, bin_source, bin_destination))
        copyfile(bin_source, bin_destination)

    sys.stderr.write("\nApproved {} bins\n\n".format(len(approved_bins)))

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("bin_directory", help=("Input fasta files should be within directory."))
parser.add_argument("busco_stats", help="Busco stats in csv format with header and bin id as index/first column")
parser.add_argument("output_directory", help="Directory where to put approved bins")
parser.add_argument("--min_completeness", default=30, type=float, help="default=30")
parser.add_argument("--max_contamination", default=10, type=float, help="default=10")
parser.add_argument("--extension", default='fa')
args = parser.parse_args()

main(args)
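For reference, a hypothetical invocation together with a minimal sketch of the busco stats csv the script expects (the completeness and duplicated column names are taken from the code above; the first column name and all values are made up):

    bin_id,completeness,duplicated
    0,95.2%,1.6%
    3,27.0%,8.3%

    $ python approve_bins_from_busco.py bins/ busco_stats.csv approved_bins/ --min_completeness 30 --max_contamination 10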
66 changes: 66 additions & 0 deletions scripts/concoct/assign_metaxa_taxonomy.py
@@ -0,0 +1,66 @@
import pandas as pd
import argparse
import os

def main(args):
    clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
    taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "identity", "aln_length", "reliability_score"])
    all_approved_prok = pd.read_table(args.all_approved_file, header=None, names=["cluster_id"], index_col=0)
    if args.all_approved_euk_file:
        all_approved_euk = pd.read_table(args.all_approved_euk_file, header=None, names=["cluster_id"], index_col=0)
    else:
        # No eukaryotic bins approved; use an empty table
        all_approved_euk = pd.DataFrame(columns=["cluster_id"]).set_index("cluster_id")

    def pair_approved_and_metaxa(all_approved):
        all_approved_set = set(all_approved.index.values)
        all_clusters_found = set()
        approved_rrna = []
        taxonomy_df['taxonomy'].fillna('', inplace=True)
        for rrna_contig in taxonomy_df.index.values:
            if rrna_contig in clustering.index:
                cluster_id = clustering.loc[rrna_contig]['cluster_id']
                metaxa_val = taxonomy_df.loc[rrna_contig]
                if cluster_id in all_approved_set:
                    tax_dict = metaxa_val.to_dict()
                    tax_dict['cluster_id'] = cluster_id
                    tax_dict['contig_id'] = rrna_contig
                    approved_rrna.append(tax_dict)
                    all_clusters_found.add(cluster_id)

        # Approved clusters without any rRNA hit get an empty row
        for cluster_id in all_approved_set:
            if cluster_id not in all_clusters_found:
                approved_rrna.append({'cluster_id': cluster_id, 'contig_id': '', 'taxonomy': '', 'identity': '', 'aln_length': '', 'reliability_score': ''})

        return approved_rrna

    approved_rrna_prok = pair_approved_and_metaxa(all_approved_prok)
    approved_stats_prok_df = pd.DataFrame(approved_rrna_prok)

    approved_rrna_euk = pair_approved_and_metaxa(all_approved_euk)
    approved_stats_euk_df = pd.DataFrame(approved_rrna_euk)

    columns = ["cluster_id", "contig_id", "taxonomy", "identity", "aln_length", "reliability_score"]
    with open(os.path.join(args.outdir, 'approved_prok.tsv'), 'w') as ofh:
        if len(approved_stats_prok_df):
            approved_stats_prok_df[columns].to_csv(ofh, sep='\t', index=False)
        else:
            ofh.write('\t'.join(columns) + '\n')
    with open(os.path.join(args.outdir, 'approved_euk.tsv'), 'w') as ofh:
        if len(approved_stats_euk_df):
            approved_stats_euk_df[columns].to_csv(ofh, sep='\t', index=False)
        else:
            ofh.write('\t'.join(columns) + '\n')


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--clustering_file', help="CONCOCT nocutup merged clustering clustering_nocutup.csv")
parser.add_argument('--taxonomy_file', help="Metaxa output all_contigs.taxonomy.txt")
parser.add_argument('--all_approved_file', help="list_of_all_approved_bins_nocutup.tsv")
parser.add_argument('--all_approved_euk_file', help="list_of_all_approved_bins_nocutup_eukaryotes.tsv")
parser.add_argument('--outdir', help="A directory for output files")

args = parser.parse_args()

main(args)
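A hypothetical invocation, assuming the file names suggested by the help texts (the output directory name is an example):

    $ python assign_metaxa_taxonomy.py \
        --clustering_file clustering_nocutup.csv \
        --taxonomy_file all_contigs.taxonomy.txt \
        --all_approved_file list_of_all_approved_bins_nocutup.tsv \
        --all_approved_euk_file list_of_all_approved_bins_nocutup_eukaryotes.tsv \
        --outdir metaxa_assignments

This would write approved_prok.tsv and approved_euk.tsv to the output directory, one row per rRNA contig in an approved cluster, plus an empty row for each approved cluster without any rRNA hit.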
85 changes: 85 additions & 0 deletions scripts/concoct/extract_16S_fasta.py
@@ -0,0 +1,85 @@
"""A script to extract rRNA sequences for approved MAGs given the metaxa results.
"""
import pandas as pd
import argparse
import os
from Bio import SeqIO

def main(args):
    clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
    taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "identity", "aln_length", "reliability_score"])
    all_approved_prok = pd.read_table(args.all_approved_file, header=None, names=["cluster_id"], index_col=0)
    if args.all_approved_euk_file:
        all_approved_euk = pd.read_table(args.all_approved_euk_file, header=None, names=["cluster_id"], index_col=0)
    else:
        # No eukaryotic bins approved; use an empty table
        all_approved_euk = pd.DataFrame(columns=["cluster_id"]).set_index("cluster_id")

    def pair_approved_and_metaxa(all_approved):
        all_approved_set = set(all_approved.index.values)
        contig_to_mag = {}
        taxonomy_df['taxonomy'].fillna('', inplace=True)
        for rrna_contig in taxonomy_df.index.values:
            # Only long contigs are present in the clustering
            if rrna_contig in clustering.index:
                cluster_id = clustering.loc[rrna_contig]['cluster_id']

                # Only report contigs from approved bins
                if cluster_id in all_approved_set:
                    contig_to_mag[rrna_contig] = cluster_id

        return contig_to_mag

    contig_to_mag_prok = pair_approved_and_metaxa(all_approved_prok)
    contig_to_mag_euk = pair_approved_and_metaxa(all_approved_euk)

    def filter_seqs(contig_to_mag, input_fasta):
        # Keep track of contigs that are present several times
        count_per_contig = {}

        output_seqs = []
        # Read the input fasta file
        for seq in SeqIO.parse(input_fasta, "fasta"):
            if seq.id in count_per_contig:
                count_per_contig[seq.id] += 1
            else:
                count_per_contig[seq.id] = 1

            # Check which MAG the contig belongs to
            if seq.id in contig_to_mag:
                mag = contig_to_mag[seq.id]
                new_mag_id = args.bin_prefix + str(mag)
                new_seq_id = "{}_{}_{}".format(seq.id, count_per_contig[seq.id], new_mag_id)
                seq.id = new_seq_id

                output_seqs.append(seq)

        return output_seqs

    prok_output_seqs = filter_seqs(contig_to_mag_prok, args.rrna_prok_fasta_file)

    with open(os.path.join(args.outdir, "prok_rRNA.fasta"), 'w') as ofh:
        SeqIO.write(prok_output_seqs, ofh, 'fasta')

    if args.rrna_euk_fasta_file:
        euk_output_seqs = filter_seqs(contig_to_mag_euk, args.rrna_euk_fasta_file)

        with open(os.path.join(args.outdir, "euk_rRNA.fasta"), 'w') as ofh:
            SeqIO.write(euk_output_seqs, ofh, 'fasta')

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--clustering_file', help="CONCOCT nocutup merged clustering clustering_nocutup.csv")
parser.add_argument('--taxonomy_file', help="Metaxa output all_contigs.taxonomy.txt")
parser.add_argument('--rrna_prok_fasta_file', help="Metaxa output all_contigs.bacteria.fasta and all_contigs.archaea.fasta combined")
parser.add_argument('--rrna_euk_fasta_file', help="Metaxa output all_contigs.eukaryota.fasta")
parser.add_argument('--all_approved_file', help="list_of_all_approved_bins_nocutup.tsv")
parser.add_argument('--all_approved_euk_file', help="list_of_all_approved_bins_nocutup_eukaryotes.tsv")
parser.add_argument('--bin_prefix', help="Prefix that will be added be fore the integer indicating cluster id")
parser.add_argument('--outdir', help="A directory for output files")

args = parser.parse_args()

main(args)
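A hypothetical invocation; the combined prokaryotic fasta name and the bin prefix are examples:

    $ python extract_16S_fasta.py \
        --clustering_file clustering_nocutup.csv \
        --taxonomy_file all_contigs.taxonomy.txt \
        --rrna_prok_fasta_file all_contigs.prok.fasta \
        --rrna_euk_fasta_file all_contigs.eukaryota.fasta \
        --all_approved_file list_of_all_approved_bins_nocutup.tsv \
        --all_approved_euk_file list_of_all_approved_bins_nocutup_eukaryotes.tsv \
        --bin_prefix MAG \
        --outdir rrna_fasta

This would write prok_rRNA.fasta and euk_rRNA.fasta to the output directory, with sequence ids of the form contigid_count_MAGclusterid.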
31 changes: 31 additions & 0 deletions scripts/concoct/extract_stats_for_approved.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python
"""
Based on the checkm results, approves bins according to the leves of contamination and completeness.
Prints the corresponding stats to stdout.
@author: alneberg
"""
from __future__ import print_function
import sys
import os
import argparse
import pandas as pd
from shutil import copyfile

def main(args):
    # Read in the checkm table
    df = pd.read_table(args.checkm_stats, index_col=0)
    # Extract the rows that meet the requirements
    filtered_df = df[(df['Completeness'] >= args.min_completeness) & (df['Contamination'] <= args.max_contamination)]

    filtered_df.to_csv(sys.stdout, sep='\t')

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("checkm_stats", help="Checkm qa stats in tab_table format")
parser.add_argument("--min_completeness", default=85, type=float, help="default=85")
parser.add_argument("--max_contamination", default=5, type=float, help="default=5")
args = parser.parse_args()

main(args)
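A hypothetical invocation; the checkm file name is an example:

    $ python extract_stats_for_approved.py checkm_qa_stats.tsv --min_completeness 85 --max_contamination 5 > approved_stats.tsv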
71 changes: 71 additions & 0 deletions scripts/dnadiff_eukaryote_reference_parsing.py
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""
Parses a large number of dnadiff reports to only report the most significant hits
Only hits with more than 10% of the query bases aligned is reported.
output_folder/fastaname1_vs_fastaname2/
output_folder/fastaname1_vs_fastaname3/
etc
"""
import argparse
import re
import os
from os.path import join as ospj

class MUMmerReport(object):
    """Represents a .report file from MUMmer's dnadiff. Stores the TotalBases
    and AlignedBases (%) stats."""
    def __init__(self, report_file):
        self.report_file = report_file

        with open(report_file) as f:
            one_to_one_parsed = False

            for line in f:
                if line.startswith("TotalBases"):
                    self.tot_bases = [int(b) for b in line.split()[1:]]
                if line.startswith("AlignedBases"):
                    # Store the percentages and the absolute counts
                    self.aligned_perc = [float(p) for p in
                                         re.findall(r'\((.*?)\%\)', line)]
                    self.aligned_bases = [int(n) for n in
                                          re.findall(r'\W([0-9]*)\(', line)]
                if not one_to_one_parsed and line.startswith("AvgIdentity"):
                    # Only the first AvgIdentity line (the 1-to-1 alignments) is used
                    self.avg_identity = [float(p) for p in line.split()[1:]][0]
                    one_to_one_parsed = True
        self.aligned_perc_ref = self.aligned_perc[0]
        self.aligned_perc_mag = self.aligned_perc[1]
        self.aligned_bases_ref = self.aligned_bases[0]
        self.aligned_bases_mag = self.aligned_bases[1]
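# For reference, the lines parsed above are assumed to look roughly like the
# following dnadiff report excerpt (illustrative values, not real output):
#
#   TotalBases        4641652          4570000
#   AlignedBases      4544046(97.90%)  4541000(99.37%)
#   AvgIdentity       98.76            98.76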

def main(args):
    # Get the basenames from the fasta files; these need to be unique since
    # they determine which report file belongs to which pair of fasta files
    fasta_names_ref = [".".join(os.path.basename(f).split(".")[0:-1]) for f in
                       args.fasta_files_ref]
    fasta_names_mag = [os.path.basename(f).split(".")[0] for f in
                       args.fasta_files_mag]

    print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format("ref_fasta_name", "mag_fasta_name", "aligned_bases_ref", "aligned_perc_ref", "aligned_bases_mag", "aligned_perc_mag", "avg_identity"))
    for ref_fasta_name in fasta_names_ref:
        for mag_fasta_name in fasta_names_mag:
            repfile = ospj(args.input_dir, "{fn1}_vs_{fn2}.report".format(
                fn1=ref_fasta_name, fn2=mag_fasta_name))
            mumr = MUMmerReport(repfile)
            if mumr.aligned_perc_mag >= args.min_coverage:
                print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(ref_fasta_name, mag_fasta_name, mumr.aligned_bases_ref, mumr.aligned_perc_ref, mumr.aligned_bases_mag, mumr.aligned_perc_mag, mumr.avg_identity))

if __name__ == "__main__":
"""Return input arguments using argparse"""
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("input_dir")
parser.add_argument("--fasta_files_ref", nargs='+')
parser.add_argument("--fasta_files_mag", nargs='+')
parser.add_argument("--min_coverage", type=float, default=10)
args = parser.parse_args()
main(args)
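A hypothetical invocation; the directory and fasta file names are examples:

    $ python dnadiff_eukaryote_reference_parsing.py dnadiff_output \
        --fasta_files_ref refs/yeast_ref.fasta \
        --fasta_files_mag mags/mag_1.fa mags/mag_2.fa \
        --min_coverage 10 > dnadiff_summary.tsv

For this to work, dnadiff_output would need to contain e.g. yeast_ref_vs_mag_1.report.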
48 changes: 48 additions & 0 deletions scripts/parse_augustus_basic.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python
"""
Reads an augustus output file and prints the amino_acid fasta file to stdout
@author: alneberg
"""
import sys
import os
import argparse

def to_fasta(s, gene_id, contig_id):
    print('>{}_{}'.format(contig_id, gene_id))
    print(s)


def main(args):
    with open(args.augustus_output) as ifh:
        protein_str = None
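        # The protein blocks in the augustus output are assumed to look
        # roughly like this (illustrative sketch, not verbatim output):
        #
        #   contig_1   AUGUSTUS   gene   1   2000   0.53   +   .   g1
        #   # protein sequence = [MKLVITGG
        #   # AWYKQEND]
        #   # end gene g1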
        for line in ifh:
            line = line.strip()
            # Regular (non-comment) lines are gff lines; use them to track the contig id
            if not line.startswith('#'):
                contig_id = line.split('\t')[0]
            # Check if a protein sequence starts
            elif line.startswith('# protein sequence'):
                line = line.replace('# protein sequence = [', '')
                protein_str = line.replace(']', '')
            elif protein_str:
                if line.startswith('# end gene'):
                    # The protein has ended; parse the gene id and print the record
                    gene_id = line.replace('# end gene ', '')
                    to_fasta(protein_str, gene_id, contig_id)
                    protein_str = None
                else:
                    # Continuation lines of the protein sequence
                    line = line.replace('# ', '')
                    line = line.replace(']', '')
                    protein_str += line

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("augustus_output", help=("Standard output format of augustus."))
args = parser.parse_args()

main(args)
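A hypothetical invocation; the file names are examples:

    $ python parse_augustus_basic.py augustus_output.gff > proteins.faa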
