Commit: More scripts useful for CONCOCT binning evaluation
Showing 6 changed files with 354 additions and 0 deletions.
@@ -0,0 +1,53 @@
#!/usr/bin/env python
"""
Based on the busco results, approves bins according to the levels of contamination and completeness.
Copies approved bins to the output directory.
@author: alneberg
"""
from __future__ import print_function
import sys
import os
import argparse
import pandas as pd
from shutil import copyfile


def main(args):
    # Read in the busco table
    df = pd.read_table(args.busco_stats, index_col=0, sep=',')
    # The busco columns are percent strings such as "95.2%"; strip the sign before comparing
    df['completeness_float'] = df['completeness'].apply(lambda x: float(x.replace('%', '')))
    df['duplicated_float'] = df['duplicated'].apply(lambda x: float(x.replace('%', '')))
    # Extract the ids for all rows that meet the requirements
    filtered_df = df[(df['completeness_float'] >= args.min_completeness) & (df['duplicated_float'] <= args.max_contamination)]

    if len(filtered_df) < 1:
        sys.stderr.write("\nApproved 0 bins\n\n")
        return None

    approved_bins = list(filtered_df.index)

    # Copy the approved bins to the new output directory
    for approved_bin_int in approved_bins:
        approved_bin = str(approved_bin_int)
        bin_source = os.path.join(args.bin_directory, approved_bin + '.' + args.extension)
        bin_destination = os.path.join(args.output_directory, os.path.basename(bin_source))

        sys.stderr.write("Copying approved bin {} from {} to {}\n".format(approved_bin, bin_source, bin_destination))
        copyfile(bin_source, bin_destination)

    sys.stderr.write("\nApproved {} bins\n\n".format(len(approved_bins)))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bin_directory", help="Directory containing the input fasta files.")
    parser.add_argument("busco_stats", help="Busco stats in csv format with header and bin id as index/first column")
    parser.add_argument("output_directory", help="Directory where to put approved bins")
    parser.add_argument("--min_completeness", default=30, type=float, help="default=30")
    parser.add_argument("--max_contamination", default=10, type=float, help="default=10")
    parser.add_argument("--extension", default='fa', help="Fasta file extension, default=fa")
    args = parser.parse_args()

    main(args)
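As a sanity check on the percent-string handling above, a minimal self-contained sketch of the filtering step on a made-up table (column names are taken from the script; the values are invented):

import pandas as pd

# Toy stand-in for the busco stats csv (values are illustrative only)
df = pd.DataFrame({"completeness": ["95.2%", "12.0%"],
                   "duplicated": ["3.1%", "40.0%"]},
                  index=["bin_1", "bin_2"])
df["completeness_float"] = df["completeness"].str.replace("%", "").astype(float)
df["duplicated_float"] = df["duplicated"].str.replace("%", "").astype(float)
# Default thresholds from the script: completeness >= 30, duplication <= 10
print(df[(df["completeness_float"] >= 30) & (df["duplicated_float"] <= 10)].index.tolist())
# ['bin_1']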
@@ -0,0 +1,66 @@
import pandas as pd
import argparse
import os


def main(args):
    clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
    taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "identity", "aln_length", "reliability_score"])
    all_approved_prok = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0)
    if args.all_approved_euk_file:
        all_approved_euk = pd.read_table(args.all_approved_euk_file, header=None, names=["contig_id"], index_col=0)

    def pair_approved_and_metaxa(all_approved):
        all_approved_set = set(all_approved.index.values)
        all_clusters_found = set()
        approved_rrna = []
        taxonomy_df['taxonomy'].fillna('', inplace=True)
        for rrna_contig in taxonomy_df.index.values:
            if rrna_contig in clustering.index:
                cluster_id = clustering.loc[rrna_contig]['cluster_id']
                metaxa_val = taxonomy_df.loc[rrna_contig]
                if cluster_id in all_approved_set:
                    tax_dict = metaxa_val.to_dict()
                    tax_dict['cluster_id'] = cluster_id
                    tax_dict['contig_id'] = rrna_contig
                    approved_rrna.append(tax_dict)
                    all_clusters_found.add(cluster_id)

        # Approved clusters without any rRNA hit still get an empty placeholder row
        for cluster_id in all_approved_set:
            if cluster_id not in all_clusters_found:
                approved_rrna.append({'cluster_id': cluster_id, 'contig_id': '', 'taxonomy': '', 'identity': '', 'aln_length': '', 'reliability_score': ''})

        return approved_rrna

    approved_rrna_prok = pair_approved_and_metaxa(all_approved_prok)
    approved_stats_prok_df = pd.DataFrame(approved_rrna_prok)

    # Only run the eukaryote pairing if the optional input file was given;
    # otherwise all_approved_euk is undefined and this would raise a NameError
    if args.all_approved_euk_file:
        approved_rrna_euk = pair_approved_and_metaxa(all_approved_euk)
        approved_stats_euk_df = pd.DataFrame(approved_rrna_euk)
    else:
        approved_stats_euk_df = pd.DataFrame()

    columns = ["cluster_id", "contig_id", "taxonomy", "identity", "aln_length", "reliability_score"]
    with open(os.path.join(args.outdir, 'approved_prok.tsv'), 'w') as ofh:
        if len(approved_stats_prok_df):
            approved_stats_prok_df[columns].to_csv(ofh, sep='\t', index=False)
        else:
            ofh.write('\t'.join(columns) + '\n')
    with open(os.path.join(args.outdir, 'approved_euk.tsv'), 'w') as ofh:
        if len(approved_stats_euk_df):
            approved_stats_euk_df[columns].to_csv(ofh, sep='\t', index=False)
        else:
            ofh.write('\t'.join(columns) + '\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--clustering_file', help="CONCOCT nocutup merged clustering clustering_nocutup.csv")
    parser.add_argument('--taxonomy_file', help="Metaxa output all_contigs.taxonomy.txt")
    parser.add_argument('--all_approved_file', help="list_of_all_approved_bins_nocutup.tsv")
    parser.add_argument('--all_approved_euk_file', help="list_of_all_approved_bins_nocutup_eukaryotes.tsv")
    parser.add_argument('--outdir', help="A directory for output files")

    args = parser.parse_args()

    main(args)
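To make the pairing logic concrete, a small sketch with invented contig and cluster ids (not from any real run):

import pandas as pd

# Invented example: contig_a sits in approved cluster 5, contig_b in unapproved cluster 7
clustering = pd.DataFrame({"cluster_id": [5, 7]}, index=["contig_a", "contig_b"])
all_approved_set = {5, 9}

approved_rrna = []
all_clusters_found = set()
for contig in clustering.index:
    cluster_id = clustering.loc[contig, "cluster_id"]
    if cluster_id in all_approved_set:
        approved_rrna.append({"cluster_id": cluster_id, "contig_id": contig})
        all_clusters_found.add(cluster_id)
# Approved cluster 9 has no rRNA hit, so it gets an empty placeholder row
for cluster_id in all_approved_set - all_clusters_found:
    approved_rrna.append({"cluster_id": cluster_id, "contig_id": ""})
print(pd.DataFrame(approved_rrna))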
@@ -0,0 +1,85 @@
"""A script to extract rRNA sequences for approved MAGs given the metaxa results.
"""
import pandas as pd
import argparse
import os
from Bio import SeqIO


def main(args):
    clustering = pd.read_table(args.clustering_file, sep=',', names=['contig_id', 'cluster_id'], index_col=0)
    taxonomy_df = pd.read_table(args.taxonomy_file, header=None, index_col=0, names=["contig_id", "taxonomy", "identity", "aln_length", "reliability_score"])
    all_approved_prok = pd.read_table(args.all_approved_file, header=None, names=["contig_id"], index_col=0)
    if args.all_approved_euk_file:
        all_approved_euk = pd.read_table(args.all_approved_euk_file, header=None, names=["contig_id"], index_col=0)

    def pair_approved_and_metaxa(all_approved):
        all_approved_set = set(all_approved.index.values)
        contig_to_mag = {}
        taxonomy_df['taxonomy'].fillna('', inplace=True)
        for rrna_contig in taxonomy_df.index.values:
            # Only long contigs are in the clustering
            if rrna_contig in clustering.index:
                cluster_id = clustering.loc[rrna_contig]['cluster_id']

                # Only report for approved bins
                if cluster_id in all_approved_set:
                    contig_to_mag[rrna_contig] = cluster_id

        return contig_to_mag

    contig_to_mag_prok = pair_approved_and_metaxa(all_approved_prok)

    # Only run the eukaryote pairing if the optional input file was given;
    # otherwise all_approved_euk is undefined and this would raise a NameError
    if args.all_approved_euk_file:
        contig_to_mag_euk = pair_approved_and_metaxa(all_approved_euk)
    else:
        contig_to_mag_euk = {}

    def filter_seqs(contig_to_mag, input_fasta):
        # Keep track of contigs that are present several times
        count_per_contig = {}

        output_seqs = []
        for seq in SeqIO.parse(input_fasta, "fasta"):
            if seq.id in count_per_contig:
                count_per_contig[seq.id] += 1
            else:
                count_per_contig[seq.id] = 1

            # Check which MAG it belongs to and rename the record accordingly
            if seq.id in contig_to_mag:
                mag = contig_to_mag[seq.id]
                new_mag_id = args.bin_prefix + str(mag)
                seq.id = "{}_{}_{}".format(seq.id, count_per_contig[seq.id], new_mag_id)

            output_seqs.append(seq)

        return output_seqs

    prok_output_seqs = filter_seqs(contig_to_mag_prok, args.rrna_prok_fasta_file)

    with open(os.path.join(args.outdir, "prok_rRNA.fasta"), 'w') as ofh:
        SeqIO.write(prok_output_seqs, ofh, 'fasta')

    euk_output_seqs = filter_seqs(contig_to_mag_euk, args.rrna_euk_fasta_file)

    with open(os.path.join(args.outdir, "euk_rRNA.fasta"), 'w') as ofh:
        SeqIO.write(euk_output_seqs, ofh, 'fasta')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--clustering_file', help="CONCOCT nocutup merged clustering clustering_nocutup.csv")
    parser.add_argument('--taxonomy_file', help="Metaxa output all_contigs.taxonomy.txt")
    parser.add_argument('--rrna_prok_fasta_file', help="Metaxa output all_contigs.bacteria.fasta and all_contigs.archaea.fasta combined")
    parser.add_argument('--rrna_euk_fasta_file', help="Metaxa output all_contigs.eukaryota.fasta")
    parser.add_argument('--all_approved_file', help="list_of_all_approved_bins_nocutup.tsv")
    parser.add_argument('--all_approved_euk_file', help="list_of_all_approved_bins_nocutup_eukaryotes.tsv")
    parser.add_argument('--bin_prefix', help="Prefix that will be added before the integer indicating cluster id")
    parser.add_argument('--outdir', help="A directory for output files")

    args = parser.parse_args()

    main(args)
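The renaming scheme produces headers of the form <contig>_<copy number>_<prefix><cluster>; for example, with invented values:

# Invented values: second rRNA hit on contig k99_123, approved cluster 5, --bin_prefix MAG
seq_id, copy_number, bin_prefix, cluster_id = "k99_123", 2, "MAG", 5
new_mag_id = bin_prefix + str(cluster_id)
print("{}_{}_{}".format(seq_id, copy_number, new_mag_id))
# k99_123_2_MAG5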
@@ -0,0 +1,31 @@
#!/usr/bin/env python
"""
Based on the checkm results, approves bins according to the levels of contamination and completeness.
Prints the corresponding stats to stdout.
@author: alneberg
"""
from __future__ import print_function
import sys
import argparse
import pandas as pd


def main(args):
    # Read in the checkm table
    df = pd.read_table(args.checkm_stats, index_col=0)
    # Extract all rows that meet the requirements
    filtered_df = df[(df['Completeness'] >= args.min_completeness) & (df['Contamination'] <= args.max_contamination)]

    filtered_df.to_csv(sys.stdout, sep='\t')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("checkm_stats", help="Checkm qa stats in tab_table format")
    parser.add_argument("--min_completeness", default=85, type=float, help="default=85")
    parser.add_argument("--max_contamination", default=5, type=float, help="default=5")
    args = parser.parse_args()

    main(args)
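A minimal sketch of the same filter on an invented tab-separated table (column names as in the script; the numbers are made up):

import pandas as pd
from io import StringIO

# Toy stand-in for checkm's tab_table output (values illustrative only)
tsv = ("Bin Id\tCompleteness\tContamination\n"
       "bin_1\t92.3\t1.4\n"
       "bin_2\t60.0\t2.0\n")
df = pd.read_table(StringIO(tsv), index_col=0)
# Default thresholds from the script: completeness >= 85, contamination <= 5
print(df[(df["Completeness"] >= 85) & (df["Contamination"] <= 5)])
# Only bin_1 passes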
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""
Parses a large number of dnadiff reports and prints only the most significant hits.
Only hits where more than --min_coverage percent (default 10%) of the MAG bases
are aligned are reported. Expects report files named like:
input_dir/fastaname1_vs_fastaname2.report
input_dir/fastaname1_vs_fastaname3.report
etc.
"""
import argparse
import re
import os
from os.path import join as ospj


class MUMmerReport(object):
    """Represents a .report file from MUMmer's dnadiff. Stores the TotalBases
    and AlignedBases (count and %) stats."""
    def __init__(self, report_file):
        self.report_file = report_file

        with open(report_file) as f:
            one_to_one_parsed = False

            for line in f:
                if line.startswith("TotalBases"):
                    self.tot_bases = [int(b) for b in line.split()[1:]]
                if line.startswith("AlignedBases"):
                    # Store both the percentages and the base counts
                    self.aligned_perc = [float(p) for p in
                                         re.findall(r'\((.*?)\%\)', line)]
                    self.aligned_bases = [int(n) for n in
                                          re.findall(r'\W([0-9]*)\(', line)]
                if not one_to_one_parsed and line.startswith("AvgIdentity"):
                    # The first AvgIdentity line belongs to the 1-to-1 alignment block
                    self.avg_identity = [float(p) for p in line.split()[1:]][0]
                    one_to_one_parsed = True
        self.aligned_perc_ref = self.aligned_perc[0]
        self.aligned_perc_mag = self.aligned_perc[1]
        self.aligned_bases_ref = self.aligned_bases[0]
        self.aligned_bases_mag = self.aligned_bases[1]


def main(args):
    # Derive names from the fasta file basenames; these need to match the
    # report file names in input_dir
    fasta_names_ref = [".".join(os.path.basename(f).split(".")[0:-1]) for f in
                       args.fasta_files_ref]
    fasta_names_mag = [os.path.basename(f).split(".")[0] for f in
                       args.fasta_files_mag]

    print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format("ref_fasta_name", "mag_fasta_name", "aligned_bases_ref", "aligned_perc_ref", "aligned_bases_mag", "aligned_perc_mag", "avg_identity"))
    for ref_fasta_name in fasta_names_ref:
        for mag_fasta_name in fasta_names_mag:
            repfile = ospj(args.input_dir, "{fn1}_vs_{fn2}.report".format(
                fn1=ref_fasta_name, fn2=mag_fasta_name))
            mumr = MUMmerReport(repfile)
            if mumr.aligned_perc_mag >= args.min_coverage:
                print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(ref_fasta_name, mag_fasta_name, mumr.aligned_bases_ref, mumr.aligned_perc_ref, mumr.aligned_bases_mag, mumr.aligned_perc_mag, mumr.avg_identity))


if __name__ == "__main__":
    # Parse input arguments using argparse
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("input_dir", help="Directory with the dnadiff .report files")
    parser.add_argument("--fasta_files_ref", nargs='+')
    parser.add_argument("--fasta_files_mag", nargs='+')
    parser.add_argument("--min_coverage", type=float, default=10)
    args = parser.parse_args()
    main(args)
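For reference, a sketch of how the two regexes pick apart an AlignedBases line (the line below follows the dnadiff .report layout as far as I recall it; the numbers are invented):

import re

line = "AlignedBases         4534510(97.52%)      4505422(99.13%)"
aligned_perc = [float(p) for p in re.findall(r'\((.*?)\%\)', line)]
aligned_bases = [int(n) for n in re.findall(r'\W([0-9]*)\(', line)]
print(aligned_perc, aligned_bases)
# [97.52, 99.13] [4534510, 4505422]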
@@ -0,0 +1,48 @@
#!/usr/bin/env python
"""
Reads an augustus output file and prints the amino acid fasta file to stdout
@author: alneberg
"""
import argparse


def to_fasta(s, gene_id, contig_id):
    print('>{}_{}'.format(contig_id, gene_id))
    print(s)


def main(args):
    with open(args.augustus_output) as ifh:
        protein_str = None
        contig_id = None
        for line in ifh:
            line = line.strip()
            if not line.startswith('#'):
                # Regular gff line; remember the contig id
                contig_id = line.split('\t')[0]
            elif line.startswith('# protein sequence'):
                # First line of a protein sequence; strip the markup
                line = line.replace('# protein sequence = [', '')
                protein_str = line.replace(']', '')
            elif protein_str is not None:
                if line.startswith('# end gene'):
                    # Protein ended; print the fasta record to stdout
                    gene_id = line.replace('# end gene ', '')
                    to_fasta(protein_str, gene_id, contig_id)
                    protein_str = None
                else:
                    # Continuation line of the protein sequence
                    line = line.replace('# ', '')
                    protein_str += line.replace(']', '')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("augustus_output", help="Standard output format of augustus.")
    args = parser.parse_args()

    main(args)
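To illustrate the expected input, a self-contained replay of the parsing rules on an invented augustus fragment (sequence and gene id are made up):

# Invented fragment of augustus output
lines = [
    "contig_1\tAUGUSTUS\tgene\t1\t100\t.\t+\t.\tg1",
    "# protein sequence = [MKTAYIAK",
    "# QRQISFVK]",
    "# end gene g1",
]
protein_str = None
for line in lines:
    line = line.strip()
    if not line.startswith('#'):
        contig_id = line.split('\t')[0]
    elif line.startswith('# protein sequence'):
        protein_str = line.replace('# protein sequence = [', '').replace(']', '')
    elif protein_str is not None:
        if line.startswith('# end gene'):
            print('>{}_{}'.format(contig_id, line.replace('# end gene ', '')))
            print(protein_str)
            protein_str = None
        else:
            protein_str += line.replace('# ', '').replace(']', '')
# prints:
# >contig_1_g1
# MKTAYIAKQRQISFVK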