forked from andrewfrank/ggdc-robot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathggdc-robot.py
369 lines (310 loc) · 14.4 KB
/
ggdc-robot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#!/usr/bin/env python
__version__ = '0.0.4'
# Automatically submit jobs to the Genome-to-Genome Distance Calculator (GGDC)
# website: https://ggdc.dsmz.de/ggdc.php, using GGDC v2.1.
# DEPENDENCIES
import argparse
import os
import sys
import itertools
import numpy
import time
import requests
# COMMAND LINE ARGUMENTS
# NOTE: adjacent string literals are joined with no separator, so every
# fragment of a description/help string below carries its own trailing space.


def _queryfile_requested(argv):
    """Return True when argv contains any spelling of the query file option.

    Recognizes '-q', '-qFILE', '--queryfile', '--queryfile=FILE' and the
    unambiguous long-option abbreviations argparse accepts (e.g. '--query').
    """
    for arg in argv:
        if arg.startswith('--'):
            flag = arg.split('=', 1)[0]
            if len(flag) > 2 and '--queryfile'.startswith(flag):
                return True
        elif arg.startswith('-q'):
            return True
    return False


description = ('Automatically submit jobs to the Genome-to-Genome '
               'Distance Calculator (GGDC) website: '
               'https://ggdc.dsmz.de/ggdc.php. Command line arguments must '
               'include either BOTH --queryfile AND --reffile, OR ONLY '
               '--samplefile.')
parser = argparse.ArgumentParser(description = description)
# exactly one of --samplefile / --queryfile must be given
ifiles = parser.add_mutually_exclusive_group(required = True)
# exactly one of --slotusage / --bruteforce must be given
submit = parser.add_mutually_exclusive_group(required = True)
# Required file arguments
ifiles.add_argument('--samplefile', '-s',
                    help = ('the full path to a text file where each new line '
                            'contains EITHER the NCBI accession number for all sample '
                            'sequences OR the full path to a fasta file containing '
                            'sample sequences; identity for the entire list is parsed '
                            'from the first entry; NOTICE: THIS OPTION IS '
                            'POTENTIALLY EXTREMELY COMPUTATIONALLY INTENSIVE.'),
                    metavar = 'FILE')
ifiles.add_argument('--queryfile', '-q',
                    help = ('the full path to a text file where each new line '
                            'contains EITHER the NCBI accession number for a query '
                            'sequence OR the full path to a fasta file containing '
                            'query sequence(s); identity for the entire list is '
                            'parsed from the first entry.'),
                    metavar = 'FILE')
# --reffile only becomes mandatory when a query file was requested; this has
# to be decided from the raw argv because parsing has not happened yet.
parser.add_argument('--reffile', '-r',
                    help = ('the full path to a text file where each new line '
                            'contains EITHER the NCBI accession number for a reference '
                            'sequence OR the full path to a fasta file containing '
                            'reference sequence(s); identity for the entire list is '
                            'parsed from the first entry.'),
                    metavar = 'FILE',
                    required = _queryfile_requested(sys.argv[1:]))
# Required arguments
parser.add_argument('--email', '-e',
                    help = 'the email address where GGDC will send results.',
                    metavar = 'EMAIL ADDRESS',
                    required = True)
submit.add_argument('--slotusage', '-u',
                    help = ('enable slot usage waiting mode; this mode forces '
                            'ggdc-robot to pause for 10 minutes before attempting to '
                            'submit another job when GGDC servers reach this '
                            'specified capacity. E.g. --slotusage 50 will prompt '
                            'ggdc-robot to wait 10 minutes when GGDC server slot usage '
                            'reaches 50 percent.'),
                    metavar = 'PERCENT')
submit.add_argument('--bruteforce', '-f',
                    help = ('enable brute force mode; this mode forces '
                            'ggdc-robot to submit jobs to the GGDC server even when the '
                            'server load is at 100 percent; ATTENTION: jobs may fail '
                            'due to GGDC job queue limits.'),
                    action = 'store_true')
# Optional arguments
parser.add_argument('--blastVariant', '-b',
                    help = ('the alignment tool used to determine matches '
                            'between query and reference genomes; GGDC recommends '
                            'GBDP2_BLASTPLUS.'),
                    choices = ['GBDP2_BLASTPLUS', 'GBDP2_BLAT', 'GBDP2_BLASTZ',
                               'GBDP2_WU-BLAST', 'GBDP2_MUMMER'],
                    default = 'GBDP2_BLASTPLUS')
parser.add_argument('--timedwait', '-t',
                    help = ('enable timed waiting mode; this mode forces '
                            'ggdc-robot to wait X minutes every Y jobs '
                            'submitted. E.g. --timedwait 25 6 will force ggdc-robot to '
                            'wait 25 minutes between every set of 6 jobs submitted. '
                            'Can be combined with --slotusage if desired.'),
                    metavar = ('MINUTES', 'JOBS'),
                    type = int,
                    nargs = 2)
# Help arguments
# parser.add_argument('--verbose','-v',
# help = ('outputs text based checkpoints and interactive '
# 'submission check.'),
# action = 'store_true')
parser.add_argument('--version',
                    action = 'version',
                    version = __version__)
# parsed at import time; the resulting namespace is handed to main()
args = parser.parse_args()
# FUNCTIONS
# get path this script was called from, useful for creating tmp files
def get_script_path():
    """Return the directory holding the script named by sys.argv[0].

    The script path is passed through os.path.realpath before its directory
    component is taken.
    """
    invoked_script = os.path.realpath(sys.argv[0])
    script_dir = os.path.dirname(invoked_script)
    return script_dir
# builds a dictionary mapping each entry of the input queryfile to the list of
# reffile entries it must be compared against (every query-reference pair)
def build_pairs_rq(queryfile, reffile):
    """Pair every query entry with every reference entry.

    Args:
        queryfile: path to a text file with one query entry per line.
        reffile: path to a text file with one reference entry per line.

    Returns:
        dict mapping each query entry to the list of reference entries, in
        file order. Empty when either file has no entries.

    Blank (or whitespace-only) lines are ignored and surrounding whitespace
    is stripped, so a stray empty line does not become an empty-string entry.
    """
    def read_entries(path):
        # one entry per non-blank line
        with open(path) as infile:
            return [line.strip() for line in infile if line.strip()]

    queries = read_entries(queryfile)
    refs = read_entries(reffile)
    pairs_dict = {}
    # without references there are no pairs, hence no keys at all
    if not refs:
        return pairs_dict
    for query in queries:
        # a query listed twice accumulates the references twice, matching
        # the pair-by-pair grouping this function has always performed
        pairs_dict.setdefault(query, []).extend(refs)
    return pairs_dict
# builds a dictionary of all unique 2 way comparisons from input samplefile;
# the number of comparisons grows as n choose 2, so large sample lists get
# expensive quickly
def build_pairs_all(samplefile):
    """Build every unordered pair of entries from a single sample file.

    Args:
        samplefile: path to a text file with one sample entry per line.

    Returns:
        dict mapping the first member of each pair to the list of second
        members, following itertools.combinations order. The last entry of
        the file therefore never appears as a key.

    Blank (or whitespace-only) lines are ignored and surrounding whitespace
    is stripped, so a stray empty line does not become an empty-string entry.
    """
    with open(samplefile) as infile:
        samples = [line.strip() for line in infile if line.strip()]
    pairs_dict = {}
    # n choose 2 pairs: grows quadratically with the number of samples
    for query, ref in itertools.combinations(samples, 2):
        pairs_dict.setdefault(query, []).append(ref)
    return pairs_dict
# creates qfiles and rfiles for all 2 way comparisons; multiple rfiles of
# roughly the same size per qfile are created when number of refs exceed
# maxrefs value; also outputs useful submission file pair info as a dict
def write_submission_files(pairs_dict, submissions_dir, maxrefs):
    """Write one query file and one reference file per submission.

    For the i-th query in pairs_dict, its references are divided into the
    smallest number of near-equal chunks such that no chunk exceeds maxrefs
    entries. Chunk j is written to 'r<i>-<j>.txt' (one reference per line)
    and the query is written to the matching 'q<i>-<j>.txt'.

    Args:
        pairs_dict: dict mapping a query entry to a list of reference entries.
        submissions_dir: existing directory that receives the files.
        maxrefs: maximum number of references per reference file.

    Returns:
        dict mapping each query file path to its reference file path.

    Raises:
        ValueError: if maxrefs is smaller than 1.
    """
    if maxrefs < 1:
        raise ValueError('maxrefs must be a positive integer.')
    files_dict = {}
    for i, (query, refs) in enumerate(pairs_dict.items()):
        refs = list(refs)
        # ceiling division; an exact multiple of maxrefs must not spill into
        # an extra chunk, and an empty list still yields one (empty) chunk
        nchunks = max(1, -(-len(refs) // maxrefs))
        # the first `extra` chunks take one additional entry so that chunk
        # sizes differ by at most one
        base, extra = divmod(len(refs), nchunks)
        start = 0
        for j in range(nchunks):
            stop = start + base + (1 if j < extra else 0)
            ref_chunk = refs[start:stop]
            start = stop
            suffix = str(i) + '-' + str(j) + '.txt'
            rfile_name = os.path.join(submissions_dir, 'r' + suffix)
            qfile_name = os.path.join(submissions_dir, 'q' + suffix)
            # context managers guarantee the data is flushed and the handles
            # are released before the files are read back for submission
            with open(rfile_name, 'w') as rfile:
                rfile.write('\n'.join(ref_chunk))
            with open(qfile_name, 'w') as qfile:
                qfile.write(query)
            files_dict[qfile_name] = rfile_name
    return files_dict
def check_submission_format(file):
    """Classify a submission list by inspecting its first line.

    Returns 'filepath' when the first line (trailing whitespace removed)
    names an existing file, otherwise 'accession'. Exits the program with
    a '<file> not found.' message when the list itself does not exist.
    """
    # guard clause: nothing to classify without the list file
    if not os.path.isfile(file):
        sys.exit(file + ' not found.')
    with open(file) as handle:
        leading_entry = handle.readline().rstrip()
    return 'filepath' if os.path.isfile(leading_entry) else 'accession'
def get_ggdc_status(url):
    """Fetch the GGDC page and return the value of its aria-valuenow attribute.

    Args:
        url: address of the page to fetch.

    Returns:
        str: the complete number found in the first ``aria-valuenow``
        attribute of the page, e.g. '50' (presumably the slot-usage
        progress bar -- confirm against the live page markup).

    Raises:
        ValueError: if the page contains no numeric aria-valuenow attribute.
    """
    import re  # needed only here; kept local to leave file imports untouched

    # a timeout keeps an unresponsive server from hanging the run forever
    page_request = requests.get(url, timeout = 60)
    page_html = page_request.text
    # capture every digit of the value; reading a single character at a
    # fixed offset would report 50 as '5' and 100 as '1'
    match = re.search(r'aria-valuenow\s*=\s*["\']?\s*(\d+(?:\.\d+)?)',
                      page_html)
    if match is None:
        raise ValueError('Unable to find the GGDC server load on ' + url + '.')
    return match.group(1)
def ggdc_submit(url, email, blastVariant, queryfile, reffile):
    """Submit one query/reference job to GGDC as a multipart form.

    Args:
        url: address the form is posted to.
        email: address placed in the form's 'email' field.
        blastVariant: value placed in the form's 'blastVariant' field.
        queryfile: path to a text file whose first line is the query, either
            an accession (sent as 'targetName') or the path of a file to
            upload (sent as 'targetGenome').
        reffile: path to a text file listing the references, either
            accessions (joined with CRLF into 'refGenbank') or paths of files
            to upload (one 'multipleRefGenomes[]' part each).

    Returns:
        'Succeeded' when the response body contains 'job with ID',
        otherwise 'Failed'.
    """
    # form skeleton; positions 1-4 are overwritten below depending on format
    form = [('blastVariant', (None, blastVariant)),
            ("targetName", (None, "")),
            ("targetGenome", (None, "")),
            ("refGenbank", (None, "")),
            ("multipleRefGenomes[]", (None, "")),
            ('email', (None, email)),
            ('singlebutton', (None, ""))]
    upload_handles = []  # every file opened for upload, closed in `finally`
    try:
        # fill in query form from queryfile
        queryformat = check_submission_format(queryfile)
        with open(queryfile) as infile:
            qlines = infile.read().splitlines()
        qline = qlines[0]
        if queryformat == 'accession':
            form[1] = ("targetName", (None, qline))
        elif queryformat == 'filepath':
            handle = open(qline, "rb")
            upload_handles.append(handle)
            form[2] = ("targetGenome",
                       (qline, handle, "application/octet-stream"))
        else:
            sys.exit('Error submitting query ' + queryformat + '. Exiting.')
        # fill in ref form from reffile
        refformat = check_submission_format(reffile)
        with open(reffile) as infile:
            rlines = infile.read().splitlines()
        if refformat == 'accession':
            form[3] = ("refGenbank", (None, '\r\n'.join(rlines)))
        elif refformat == 'filepath':
            ref_uploads = []
            for rline in rlines:
                handle = open(rline, "rb")
                upload_handles.append(handle)
                ref_uploads.append(("multipleRefGenomes[]",
                                    (rline, handle,
                                     "application/octet-stream")))
            # swap the single placeholder for one part per reference file,
            # keeping the references in file order
            form[4:5] = ref_uploads
        else:
            sys.exit('Unable to submit reference ' + refformat + '. Exiting.')
        # submit GGDC job
        submission = requests.post(url, files = form)
    finally:
        for handle in upload_handles:
            handle.close()
    # get GGDC job response
    submission_html = submission.content.decode()
    if 'job with ID' in submission_html:
        return 'Succeeded'
    return 'Failed'
# iteratively submits each qfile-rfile pair to GGDC using ggdc_submit;
# optionally pauses when the server load reaches the slot usage threshold
# and/or for a set number of minutes after every set of submitted jobs
def ggdc_submission_controller(status_url, submit_url, email, blastVariant,
                               files_dict, bruteforce, wait, slotusage):
    """Submit every query/reference file pair to GGDC, throttling as asked.

    Args:
        status_url: page queried for the current server load before each job.
        submit_url: address each job is posted to.
        email: passed through to ggdc_submit.
        blastVariant: passed through to ggdc_submit.
        files_dict: dict mapping query file paths to reference file paths.
        bruteforce: when exactly False, the server load is compared with
            slotusage before each job; any other value skips that check.
        wait: None, or a pair (minutes, jobs): after `jobs` submissions the
            function sleeps for `minutes` minutes.
        slotusage: load threshold in percent (number or numeric string);
            only used when bruteforce is False.

    When the load is at or above the threshold the function waits 10 minutes
    once and then submits without checking the load again.
    """
    separator = '---------------------------------------------------'

    def as_number(value):
        # the load and the threshold arrive as text; compare them as numbers
        # because '100' >= '50' is False for strings
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    threshold = None
    if bruteforce is False:
        threshold = as_number(slotusage)
        if threshold is None:
            sys.exit('Slot usage must be a number. Exiting.')

    submission_count = 0
    jobs_requested = len(files_dict)
    print('Jobs requested = ' + str(jobs_requested))
    print(separator)
    for job_count, (qfile, rfile) in enumerate(files_dict.items(), 1):
        job_counter = str(job_count) + '/' + str(jobs_requested)
        status = get_ggdc_status(status_url)
        print('Current GGDC server load:' + str(status) + '%')
        if bruteforce is False:
            load = as_number(status)
            if load is None:
                print('Unable to interpret the GGDC server load; '
                      'submitting without a load check.')
            elif load >= threshold:
                print(str(slotusage) +
                      (' percent of GGDC server slots are currently used, '
                       'waiting 10 minutes before attempting additional '
                       'submissions.'))
                print('Waiting to submit job ' + job_counter + '.')
                if wait is not None:
                    print('This is job ' + str(submission_count) +
                          ' of this submission set.')
                print(separator)
                time.sleep(600)  # wait 10 minutes
        # NOTE(review): the timed wait is applied in every mode, since the
        # help text allows combining it with either submission mode.
        if wait is not None and submission_count == wait[1]:
            # wait holds ints, so convert before concatenating
            print(str(wait[1]) + ' jobs submitted. Pausing for ' +
                  str(wait[0]) + ' minutes.')
            submission_count = 0
            time.sleep(wait[0] * 60)
        submission = ggdc_submit(submit_url, email, blastVariant,
                                 qfile, rfile)
        submission_count += 1
        print('GGDC server submission attempt:')
        print(submission)
        if 'Succeeded' in submission:
            print('Successfully submitted job ' + job_counter + '.')
            if wait is not None:
                print('This is job ' + str(submission_count) +
                      ' of this submission set.')
        else:
            print('Job ' + job_counter +
                  ' failed. Skipping to the next job.')
        print(separator)
        # NOTE(review): short pause after every attempt; confirm whether it
        # was meant to follow failed attempts only.
        time.sleep(2)
def main(args):
    """Build the comparison pairs, write submission files and submit them.

    Args:
        args: parsed command line namespace providing email, blastVariant,
            queryfile, reffile, samplefile, slotusage, bruteforce and
            timedwait.

    Pairs come from build_pairs_rq when both a query file and a reference
    file are given, or from build_pairs_all when a sample file is given.
    Submission files are written to a 'submissions' directory next to the
    script. The function always ends by raising SystemExit: with status 0
    after a complete run, or with an error message when the input files are
    specified incorrectly.
    """
    status_url = 'http://ggdc.dsmz.de/ggdc.php'
    submit_url = 'http://ggdc.dsmz.de/submit_ggdc_job.php'
    email = args.email
    blastVariant = args.blastVariant
    queryfile = args.queryfile
    reffile = args.reffile
    samplefile = args.samplefile
    slotusage = args.slotusage
    bruteforce = args.bruteforce
    wait = args.timedwait
    maxrefs = 75  # maximum number of references written per reference file
    if queryfile is not None and reffile is not None:
        pairs_dict = build_pairs_rq(queryfile, reffile)
    elif queryfile is not None:
        sys.exit(('You must submit a query file and reference file together. '
                  'Exiting.'))
    elif samplefile is not None:
        pairs_dict = build_pairs_all(samplefile)
    else:
        sys.exit('Error with data file specification on the command line. Exiting.')
    submissions_dir = os.path.join(get_script_path(), 'submissions')
    if not os.path.exists(submissions_dir):
        os.makedirs(submissions_dir)
    files_dict = write_submission_files(pairs_dict,
                                        submissions_dir,
                                        maxrefs = maxrefs)
    ggdc_submission_controller(status_url, submit_url, email, blastVariant,
                               files_dict, bruteforce, wait, slotusage)
    # sys.exit(<str>) would print to stderr and exit with status 1, which
    # reports a successful run as a failure; print and exit with 0 instead
    print('Script complete. Exiting.')
    sys.exit(0)


if __name__ == "__main__":
    main(args)