-
Notifications
You must be signed in to change notification settings - Fork 2
/
run.py
162 lines (139 loc) · 5.42 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/python
"""Wrapper script to run FAMLI on one or more FASTQ files."""
import os
import uuid
import time
import shutil
import logging
import argparse
from lib.midas_helpers import run_midas
from lib.midas_helpers import parse_midas_output
from lib.exec_helpers import get_reference_database
from lib.exec_helpers import return_results
from lib.exec_helpers import exit_and_clean_up
from lib.fastq_helpers import get_reads_from_url
from lib.fastq_helpers import count_fastq_reads
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""
    Analyze a set of reads with MIDAS.
    """)
    parser.add_argument("--input",
                        type=str,
                        help="""Location for input file(s). Comma-separated.
                        (Supported: sra://, s3://, or ftp://).""")
    parser.add_argument("--ref-db",
                        type=str,
                        help="""Folder containing reference database.
                        (Supported: s3://, ftp://, or local path).""")
    parser.add_argument("--output-folder",
                        type=str,
                        help="""Folder to place results.
                        (Supported: s3://, or local path).""")
    parser.add_argument("--threads",
                        type=int,
                        default=16,
                        help="Number of threads to use aligning.")
    parser.add_argument("--temp-folder",
                        type=str,
                        default='/share',
                        help="Folder used for temporary files.")
    args = parser.parse_args()

    # Make a scratch folder (random 8-char suffix) for all working files.
    # os.makedirs raises FileExistsError on a (vanishingly unlikely) suffix
    # collision -- unlike the previous `assert`, this still fires under
    # `python -O`, where asserts are stripped.
    temp_folder = os.path.join(args.temp_folder, str(uuid.uuid4())[:8])
    os.makedirs(temp_folder)

    # Set up logging: messages go both to a file inside the temp folder
    # (so they can be bundled into the per-sample output JSON below) and
    # to STDOUT.
    log_fp = os.path.join(temp_folder, "log.txt")
    logFormatter = logging.Formatter(
        '%(asctime)s %(levelname)-8s [MIDAS] %(message)s'
    )
    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.INFO)
    # Write to file
    fileHandler = logging.FileHandler(log_fp)
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)
    # Also write to STDOUT
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    # Fetch the reference database once; it is shared by every sample below.
    # NOTE(review): exit_and_clean_up appears to remove temp_folder and
    # terminate the process (defined in lib.exec_helpers) -- confirm.
    # `except Exception` (not a bare `except:`) so SystemExit and
    # KeyboardInterrupt still propagate normally.
    try:
        db_fp = get_reference_database(
            args.ref_db,
            temp_folder
        )
    except Exception:
        exit_and_clean_up(temp_folder)
    logging.info("Reference database: " + db_fp)

    # Align each of the inputs and calculate the overall abundance
    for input_str in args.input.split(','):
        # Keep track of the time elapsed to process each sample
        start_time = time.time()

        # Make a new temporary folder for this sample; makedirs raises
        # FileExistsError if the random name collides.
        sample_temp_folder = os.path.join(temp_folder, str(uuid.uuid4())[:8])
        logging.info(
            "Making temp folder for this sample: {}".format(sample_temp_folder)
        )
        os.makedirs(sample_temp_folder)

        logging.info("Processing input argument: " + input_str)

        # Capture each command in a try statement
        # Get the input reads
        try:
            read_fp = get_reads_from_url(input_str, sample_temp_folder)
        except Exception:
            exit_and_clean_up(temp_folder)

        # Run MIDAS
        try:
            output_folder = run_midas(
                read_fp,             # FASTQ file path
                db_fp,               # Local path to DB
                sample_temp_folder,  # Folder for results
                threads=args.threads,
            )
        except Exception:
            exit_and_clean_up(temp_folder)

        # Parse the output
        try:
            midas_summary = parse_midas_output(
                output_folder,
                ref_db=db_fp,
            )
        except Exception:
            exit_and_clean_up(temp_folder)

        # Name the output file based on the input file
        # Ultimately adding ".json.gz" to the input file name
        output_prefix = input_str.split("/")[-1]

        # Count the total number of reads
        logging.info("Counting the total number of reads")
        n_reads = count_fastq_reads(read_fp)
        logging.info("Reads in input file: {}".format(n_reads))

        # Read in the logs. Use a context manager so the handle is closed
        # promptly -- the previous bare open() leaked the file descriptor.
        logging.info("Reading in the logs")
        with open(log_fp, 'rt') as f:
            logs = f.readlines()

        # Wrap up all of the results into a single JSON
        # and write it to the output folder
        output = {
            "input_path": input_str,
            "input": output_prefix,
            "output_folder": args.output_folder,
            "logs": logs,
            "ref_db": db_fp,
            "ref_db_url": args.ref_db,
            "results": midas_summary,
            "total_reads": n_reads,
            "time_elapsed": time.time() - start_time
        }
        return_results(
            output, output_prefix, args.output_folder, sample_temp_folder
        )

        # Delete any files that were created for this sample
        logging.info("Removing temporary folder: " + sample_temp_folder)
        shutil.rmtree(sample_temp_folder)

    # Delete all files created for this analysis
    logging.info("Removing temporary folder: " + temp_folder)
    shutil.rmtree(temp_folder)

    # Stop logging
    logging.info("Done")
    logging.shutdown()