-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpmg_phylogeny.py
317 lines (268 loc) · 9.62 KB
/
pmg_phylogeny.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#!/home/frankaylward/bin
import argparse
import os
import sys
import subprocess
import re
import shlex
import pandas
#import glob
import operator
from collections import defaultdict
from Bio import SeqIO
# This script requires that HMMER3 and Prodigal are install in the PATH.
# create and wipe the LOGFILE before beginning
o = open("LOGFILE.txt", "w")
o.close()
#########################################
### define prodigal launcher function ###
#########################################
suffix = ".fa"
def predict_genes(folder):
for filenames in os.listdir(folder):
if filenames.endswith(".fna") or filenames.endswith(".fa"):
s = filenames.split(".")
suffix = "."+s[-1]
#print "Predicting genes..."
input_file = os.path.join(folder, filenames)
protein_file = re.sub(suffix, ".faa", input_file)
gff_file = re.sub(suffix, ".gff", input_file)
acc = re.sub(suffix, "", filenames)
cmd = "prodigal -i "+ input_file +" -f gff -o "+ gff_file +" -a "+ protein_file
#print cmd
cmd2 = shlex.split(cmd)
subprocess.call(cmd2, stdout=open("LOGFILE.txt", 'w'), stderr=open("LOGFILE.txt", 'a'))
def make_nr(folder):
for filenames in os.listdir(folder):
if filenames.endswith(".faa"):
protein_file = os.path.join(folder, filenames)
acc1 = re.sub(".faa", "", filenames)
acc = re.sub("_protein", "", acc1)
# make protein names non-redundant
output_seqs = []
handle = open(protein_file, "r")
for record in SeqIO.parse(handle, "fasta"):
name = record.id
if ".." in name:
pass
else:
record.id = acc +".."+ name
#print record.id, acc, filenames, name
output_seqs.append(record)
handle.close()
output = open(protein_file, "w")
SeqIO.write(output_seqs, output, "fasta")
# end
#######################################
### define HMMER3 launcher function ###
#######################################
def run_hmmer(folder, db_path, cutoff):
for filenames in os.listdir(folder):
if filenames.endswith(".faa"):
input_file = os.path.join(folder, filenames)
hmm_out = re.sub(".faa", ".hmm_out", input_file)
dom_out = re.sub(".faa", ".hmm_dom_out", input_file)
cmd = cmd = "hmmsearch --cpu 16 --tblout " + hmm_out + " "+ cutoff +" --domtblout "+ dom_out + " " + db_path + " " + input_file
cmd2 = shlex.split(cmd)
#subprocess.call(cmd2, stdout=open("LOGFILE.txt", 'w'), stderr=open("LOGFILE.txt", 'a'))
# end
######################################
######## get protein dictionary ######
######################################
def parse_faa(folder, dictionary):
seq_dict = {}
for filenames in os.listdir(folder):
if filenames.endswith(".faa"):
input_file = os.path.join(folder, filenames)
for j in SeqIO.parse(input_file, "fasta"):
#print j.id
if j.id in dictionary:
#print j.id
seq_dict[j.id] = j
return seq_dict
# end
####################################
#### define HMM output parser ######
####################################
def hmm_parser(folder, suffix, output):
full_hit_dict = {}
cog_dict = defaultdict(list)
score_list = {}
prot_list = []
combined_output = open(output, "w")
#combined_output.write("protein\tacc\thit\tscore\tlength\tcategory\tspecies\tdomain\tphylum\torder\tp2\tp3\n")
hits = []
bit_dict = {}
df = pandas.DataFrame()
for filenames in os.listdir(folder):
if filenames.endswith(suffix):
acc = re.sub(suffix, "", filenames)
f = open(folder+"/"+filenames, 'r')
hit_dict = {}
bit_dict = defaultdict(int)
#bit_dict = {}
hit_type = {}
marker_dict = {}
for line in f.readlines():
if line.startswith("#"):
pass
else:
newline = re.sub( '\s+', '\t', line)
list1 = newline.split('\t')
ids = list1[0]
hit = list1[2]
acc2 = list1[3]
if "COG" in hit:
pass
else:
hit = acc2
bit_score = list1[5]
score = float(bit_score)
if score > bit_dict[ids]:
hit_dict[ids] = hit
bit_dict[ids] = score
bit_sorted = sorted(bit_dict.items(), key=operator.itemgetter(1), reverse=True)
output_list = []
for item in bit_sorted:
entry = item[0]
if entry in hit_dict:
#if entry in bit_dict and entry in hit_dict:
output_list.append(entry +"\t"+ str(hit_dict[entry]) +"\t"+ str(bit_dict[entry]))
full_hit_dict[entry] = entry
#print entry
#else:
# print entry
#parsed = open(folder+"/"+filenames+".parsed", 'r')
hit_profile = defaultdict(int)
done = []
for line in output_list:
line1 = line.rstrip()
tabs = line1.split("\t")
ids = tabs[0]
hits.append(ids)
cog = tabs[1]
score = tabs[2]
nr = acc +"_"+ cog
if nr in done:
combined_output.write(ids +"\t"+ acc +"\t"+ cog +"\t"+ score +"\tNH\t"+ "\n")
#pass
else:
combined_output.write(ids +"\t"+ acc +"\t"+ cog +"\t"+ score +"\tBH\t"+ "\n")
done.append(nr)
#s1 = pandas.DataFrame(pandas.Series(hit_profile, name = acc))
#df = pandas.concat([df, s1], axis=1)
return full_hit_dict
# end
########################################################################
##### use argparse to run through the command line options given #######
########################################################################
args_parser = argparse.ArgumentParser(description="Script for identifying phylogenetic marker genes (PMGs) in sequencing data\nFrank O. Aylward, Assistant Professor, Virginia Tech Department of Biological Sciences <faylward at vt dot edu>", epilog="Virginia Tech Department of Biological Sciences")
args_parser.add_argument('-i', '--input_folder', required=True, help='Input folder of FNA sequence files')
args_parser.add_argument('-c', '--cogs', required=True, help='output file for the COGs file, used in ete3')
args_parser.add_argument('-o', '--output_faa', required=True, help='amino acid fasta output file of the PMGs identified')
args_parser.add_argument('-f', '--genes', type=bool, default=False, const=True, nargs='?', help='Implies genes have already been predicted, and .faa files are already in the input folder)')
args_parser.add_argument('-n', '--nr', type=bool, default=False, const=True, nargs='?', help='Implies genes are already non-redundant and no re-naming is necessary)')
args_parser.add_argument('-d', '--db', required=True, help='Database for matching. Accepted sets are "embl", "checkm_bact", "checkm_arch", "rpl1_arch", "rpl1_bact", or "rpl2"')
args_parser = args_parser.parse_args()
# set up object names for input/output/database folders
input_dir = args_parser.input_folder
cogs_file = args_parser.cogs
faa_out = args_parser.output_faa
db = args_parser.db
db_path = os.path.join("hmm/", db+".hmm")
######## Check if genes have already been predicted.
if not (args_parser.genes):
print "\nPredicting genes...\n"
predict_genes(input_dir)
else:
print "\nGenes already predicted.\n"
####### Check if predicted proteins should be made non-redundant
if not (args_parser.nr):
print "\nMaking protein entries non-redundant...\n"
make_nr(input_dir)
else:
print "\nGenes already non-redundant.\n"
###### Get a list of the HMM entries that should be used
cogs = []
list_file = open(os.path.join("hmm/", db+".list"), "r")
for i in list_file.readlines():
line = i.rstrip()
cogs.append(line)
###########################################
################ run functions ############
###########################################
cutoff_dict = {}
if "checkm" in db:
cutoff = "--cut_nc"
else:
cutoff = " "
cutoff_file = open("hmm/embl_cutoffs.txt", "r")
for i in cutoff_file.readlines():
line = i.rstrip()
tabs = line.split("\t")
name = tabs[0]
value = tabs[1]
cutoff_dict[name] = float(value)
print "Running HMMER3...\n"
run_hmmer(input_dir, db_path, cutoff)
print "Parsing HMMER3 outputs...\n"
all_hits = hmm_parser(input_dir, ".hmm_out", "all_hmm_out.txt")
#print all_hits
# this is old code to output an annotation table- it may be useful in some cases but isn't part of the core code.
#name2 = 'hmm_profile.speci.txt'
#df.fillna(0, inplace=True, axis=1)
#df2 = speci_df.transpose()
#df2.to_csv(name2, sep='\t')
print "Getting protein dictionary...\n"
# get a dictionary that links protein IDs to their SeqIO records
seq_dict = parse_faa(input_dir, all_hits)
###########################################################################
######### Final compilation of files for input to ete3 ####################
###########################################################################
print "Final compilation...\n"
hmm_out = open("all_hmm_out.txt", "r")
acc_dict = {}
cog_dict = defaultdict(list)
for j in hmm_out.readlines():
line = j.rstrip()
tabs = line.split("\t")
hit_type = tabs[4]
if hit_type == "BH":
protein = tabs[0]
acc = tabs[1]
cog = tabs[2]
score = tabs[3]
if db == "embl":
if score >= cutoff_dict[cog]:
acc_dict[protein] = acc
#print protein, acc, cog
cog_dict[cog].append(protein)
else:
acc_dict[protein] = acc
#print protein, acc, cog
cog_dict[cog].append(protein)
#print cog_dict
cog_file = hmm_out = open(cogs_file, "w")
out_proteins = open(faa_out, "w")
output_seqs = []
tally = []
tally2 = []
for n in cogs:
protein_list = cog_dict[n]
#print protein_list
for i in protein_list:
acc = acc_dict[i]
acc2 = re.sub("_", ".", acc)
new_name = acc2 +"_"+ n
cog_file.write(new_name +"\t")
tally.append(new_name)
record = seq_dict[i]
record.description = record.id
#record.description = ""
record.id = new_name
output_seqs.append(record)
tally2.append(record.id)
cog_file.write("\n")
SeqIO.write(output_seqs, out_proteins, "fasta")
print len(tally), len(tally2), len(set(tally)), len(set(tally2))
# and then run through ete3 using the command "ete3 build -a proteins_for_phylogeny.faa --cogs cog_file.txt -w standard_trimmed_fasttree -m sptree_fasttree_90 --clearall -C 8 -o test_tree"