-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
executable file
·247 lines (188 loc) · 8.04 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import os
import sys
from pathlib import Path
import h5py
import numpy as np
import pandas as pd
# Promote every key/value of the Snakemake ``config`` mapping to a module
# global, so the rules below (and the included rule files) can refer to
# config entries by bare name (e.g. ``basedir``, ``reference``, ``per_batch``).
for k in config.keys():
    globals()[k] = config[k]

# Default minimum read quality when the config did not provide one.
if "min_read_q" not in globals():
    min_read_q = 0

# FAST5 analysis group name, e.g. 'Basecall_1D_000'.
# NOTE(review): ``basecall_id`` must be present in the config — no default.
basecall_group = 'Basecall_1D_%s' % basecall_id

# Default sample list: one sample per subdirectory of <basedir>/raw.
if 'unique_samples' not in config.keys():
    unique_samples = os.listdir(os.path.join(basedir, 'raw'))

# Default filename extension for fastq input files.
if 'fastq_ending' not in config.keys():
    fastq_ending = 'fq'

# If no chromosome list was configured, infer it from the reference FASTA
# headers (first whitespace-separated token after '>').
if 'chroms' not in config.keys():
    with open(reference, "r") as f:
        print("Trying to infer chromosomes from reference")
        chroms = [l[1:].split(" ")[0] for l in f if l.startswith(">")]

# Normalize chromosome names to strings (a YAML config may yield ints).
chroms = [str(c) for c in chroms]

#basedir = Path(basedir)
'''
Detect batches for each sample. Fills the module-level object ``sbf`` with:
    sb_samples: flat list of length num_samples x num_batches
    sb_batches: flat list of length num_samples x num_batches
sb_samples and sb_batches are aligned such that zip(sb_samples, sb_batches)
yields the full list of unique (sample, batch) tuples. The helper function
``samplebatches(sample)`` (defined below) returns the batches of one sample.
'''
class SampleBatchesFilenames:
    """Discover (sample, batch) pairs — and fast5 filenames — on disk.

    If <basedir>/raw exists, pairs are inferred from the layout
    raw/<sample>/batched/<batch>/<filename>.fast5. Otherwise, if
    <basedir>/fastq exists, pairs come from fastq/<sample>/<batch>.<ext>.

    Attributes:
        sb_samples, sb_batches: parallel lists of unique (sample, batch)
            pairs, in first-seen order.
        sbf_samples, sbf_batches, sbf_filenames: parallel lists with one
            entry per fast5 file; None when input is fastq-only.
    """
    def __init__(self):
        # Defaults so every attribute exists even if neither layout is found.
        self.sb_samples = []
        self.sb_batches = []
        self.sbf_samples = None
        self.sbf_batches = None
        self.sbf_filenames = None
        if os.path.exists(os.path.join(basedir, 'raw')):
            # FIX(review): the pattern previously contained the literal
            # '(unknown).fast5' (garbled), which can never match; the
            # '{filename}' wildcard is required for sbf.filename to exist.
            sbf = glob_wildcards(os.path.join(basedir, 'raw', '{sample}', 'batched', '{batch}', '{filename}.fast5'))
            self.sbf_samples = sbf.sample
            self.sbf_batches = sbf.batch
            self.sbf_filenames = sbf.filename
            # Deduplicate (sample, batch) pairs preserving first-seen order.
            # FIX(review): keying on f"{s}_{b}" could collide (e.g.
            # ("a_b", "c") vs ("a", "b_c")); dedup on the tuple instead.
            unique_pairs = dict.fromkeys(zip(sbf.sample, sbf.batch))
            self.sb_samples = [s for s, _ in unique_pairs]
            self.sb_batches = [b for _, b in unique_pairs]
        elif os.path.exists(os.path.join(basedir, 'fastq')):
            sb = glob_wildcards(os.path.join(basedir, 'fastq', '{sample}', '{batch}.%s' % fastq_ending))
            self.sb_samples = sb.sample
            self.sb_batches = sb.batch

sbf = SampleBatchesFilenames()
def samplebatches(sample):
    """Return the list of batch names belonging to *sample*.

    Reads the module-level ``sbf`` discovery object; ``sb_samples`` and
    ``sb_batches`` are parallel lists, so filter the zipped pairs rather
    than indexing by position.
    """
    return [b for s, b in zip(sbf.sb_samples, sbf.sb_batches) if s == sample]
'''
##############################################################################
# Combinators
##############################################################################
Below we define some combinators of wildcards. These can be used in "all_"
type rules. The expand function in Snakemake assumes that the list of
wildcard combinations is always the full cross-product. In our case, this
is not the case, as samples have different number of batches.
Therefore we define a few custom combinators below
'''
def zip_combinator(*args, **kwargs):
    """Snakemake ``expand`` combinator that zips all wildcard lists together.

    Unlike the default cross-product combinator, the i-th value of every
    wildcard list is combined into one tuple. Designed for the paired
    ``sample`` and ``batch`` wildcards, which must stay aligned.
    ``kwargs`` is accepted (and ignored) to satisfy the combinator protocol.
    """
    # All wildcard lists are parallel; emit one tuple per index.
    yield from zip(*args)
def zip2_comb3_combinator(*args, **kwargs):
    """Zip the first two wildcard lists, cross-combine with the third.

    Yields (wc1[i], wc2[i], wc3[j]) for every aligned pair of the first two
    lists combined with every element of the third. Used to build all
    sample-batch-mtype combinations.
    ``kwargs`` is accepted (and ignored) to satisfy the combinator protocol.
    """
    # First two wildcards stay zipped (sample/batch) ...
    for first, second in zip(args[0], args[1]):
        # ... and each pair is combined with every value of the third.
        for third in args[2]:
            yield first, second, third
'''
##############################################################################
# Splitting fast5 files into batches for parallel processing computation
##############################################################################
The input directory raw/samplename/guppy/ will be searched for fast5 files.
The list of fast5 files per sample is split into evenly sized batches.
The directory raw/samplename/batched will contain a subdirectory for each
batch.
CAVEAT: The fast5 files will be **symlinked** from the batch directory to the
basecalled directory. Do NOT delete the fast5 files from the basecalled
directory!
Rule will not be performed if "batched" directory exists. Delete the "batched"
directory, if you want to redo batch splitting.
This rule will be performed locally, as it is only creating symlinks and is
not computationally expensive.
'''
def split_batches_from_file_list(all_files, outdir, batch_size=None):
    """Distribute *all_files* into consecutively numbered batch directories.

    Creates ``outdir`` (if missing) and subdirectories '0', '1', ..., each
    holding up to *batch_size* symlinks pointing at the original files; the
    files themselves are never copied or moved.

    Args:
        all_files: iterable of file paths to distribute.
        outdir: target "batched" directory.
        batch_size: files per batch; defaults to the config-provided global
            ``per_batch`` for backward compatibility.
    """
    if batch_size is None:
        batch_size = per_batch  # global injected from the Snakemake config
    files = list(all_files)
    # Create the "batched" directory if it doesn't exist.
    os.makedirs(outdir, exist_ok=True)
    # Chunk the file list into consecutive slices of batch_size.
    for b, start in enumerate(range(0, len(files), batch_size)):
        batchdir = os.path.join(outdir, '%d' % b)
        os.mkdir(batchdir)
        for src in files[start:start + batch_size]:
            # Symlink (not copy) each file into its batch directory.
            os.symlink(src, os.path.join(batchdir, os.path.basename(src)))
def split_batches_input(wildcards):
    """Input function for rule split_batches.

    Returns every fast5 file found under the sample's raw/<sample>/guppy
    directory, resolved via Snakemake's glob_wildcards/expand pair.
    """
    pattern = os.path.join(basedir, 'raw', wildcards['sample'], 'guppy', '{fname}.fast5')
    found = glob_wildcards(pattern)
    return expand(pattern, fname=found.fname)
# Split a sample's fast5 files into batch subdirectories for parallel
# processing. Runs locally (see ``localrules`` at the bottom) since it only
# creates symlinks.
rule split_batches:
    input: split_batches_input
    # Flag file marking that batch splitting finished for this sample.
    output: os.path.join(basedir, 'raw', '{sample}', 'batched', 'done')
    params:
        outdir=os.path.join(basedir, 'raw', '{sample}', 'batched')
    run:
        split_batches_from_file_list(input, params.outdir)
        # Touch the 'done' flag file.
        with open(output[0], 'w') as fp:
            pass
def split_batches_from_fastq_input(wildcards):
    """Input function for rule split_batches_from_fastq.

    Returns every fastq file (with the configured ``fastq_ending``) found
    under the sample's fastq/<sample>/guppy directory.
    """
    pattern = os.path.join(basedir, 'fastq', wildcards['sample'], 'guppy', '{fname}.%s' % fastq_ending)
    hits = glob_wildcards(pattern)
    return expand(pattern, fname=hits.fname)
# Same batching logic as rule split_batches, but for fastq-based input under
# <basedir>/fastq/<sample>/. Local rule (symlinks only).
rule split_batches_from_fastq:
    input: split_batches_from_fastq_input
    # Flag file marking that batch splitting finished for this sample.
    output: os.path.join(basedir, 'fastq', '{sample}', 'batched', 'done')
    params:
        outdir=os.path.join(basedir, 'fastq', '{sample}', 'batched')
    run:
        split_batches_from_file_list(input, params.outdir)
        # Touch the 'done' flag file.
        with open(output[0], 'w') as fp:
            pass
# Aggregate targets: run batch splitting for every sample, for the fast5
# layout and the fastq layout respectively.
rule all_split_batches:
    input: expand(rules.split_batches.output, sample=unique_samples)

rule all_split_batches_from_fastq:
    input: expand(rules.split_batches_from_fastq.output, sample=unique_samples)
# Core pipeline rule modules (always loaded).
include: 'rules/fastq.rules'
include: 'rules/guppy.rules'
include: 'rules/mapping.rules'
include: 'rules/nanopolish.rules'
include: 'rules/pycoqc.rules'

# Optional modules, enabled only when the corresponding config key was
# promoted to a module global at the top of this Snakefile.
if "nanonome" in globals():
    include: "rules/nanonome.rules"
if "bonito" in globals():
    include: "rules/bonito.rules"
if "accessibility_model" in globals():
    include: 'rules/accessibility_prediction.rules'
else:
    print("accessibility_model not defined in config. Support for accessibility prediction disabled")
if "whatshap" in globals():
    include: "rules/whatshap.rules"
if "pycometh" in globals():
    include: "rules/pycometh.rules"

# User-supplied rule files: include each *.rules file from rules.d/ whose
# name appears in the ``custom_rules`` config entry.
if "custom_rules" in globals().keys():
    print(custom_rules)
    for f in Path("rules.d").iterdir():
        if f.name.endswith(".rules"):
            if f.name in custom_rules:
                include: f
'''
##############################################################################
# Reports methylation histograms - only really useful for benchmarking
# datasets where you know the methylation rate (like fully methylated or
# fully unmethylated datasets)
##############################################################################
'''
def report_methylation_input(wildcards):
    """Input function for rule report_methylation.

    Expands the per-batch methylation-call outputs over every batch of the
    requested sample, for the requested methylation type.
    """
    # FIX(review): ``samplebatches`` is a function (defined above), so it
    # must be called — subscripting it (samplebatches[...]) raises TypeError.
    return expand(rules.metcall.output, batch=samplebatches(wildcards.sample),
                  sample=wildcards.sample, mtype=wildcards.mtype)
# Render a per-sample methylation histogram report PDF for one methylation
# type, by invoking report_met.py on the merged 'met' output directory.
rule report_methylation:
    input: report_methylation_input
    output: os.path.join(basedir, 'report/{sample}_{mtype}.pdf')
    params:
        # Cluster submission parameters (consumed by the cluster profile).
        jobname='report_{sample}_{mtype}',
        runtime='0:10',
        memusage='4000',
        slots='1',
        misc=''
    # ``{python}`` is the interpreter path from the config.
    shell: '{python} report_met.py ' + os.path.join(basedir, 'met') + \
           ' {output} {wildcards.sample} {wildcards.mtype}'
# Aggregate target: methylation reports for every sample and every mtype.
rule all_report_methylation:
    input: expand(rules.report_methylation.output, mtype=mettypes, sample=unique_samples)

# Rules cheap enough to run on the head node instead of the cluster.
localrules: prepare_mergebams, split_batches, split_batches_from_fastq