-
Notifications
You must be signed in to change notification settings - Fork 0
/
basemods_spark_runner_yarn_cluster.py
1902 lines (1609 loc) · 75.3 KB
/
basemods_spark_runner_yarn_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
# coding=utf-8
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.storagelevel import StorageLevel
from subprocess import Popen, PIPE
from itertools import groupby
from pbcore.io import BasH5Reader, CmpH5Reader
import h5py
import shlex
import os
import numpy
import hashlib
import socket
import time
import random
import shutil
import xml.etree.ElementTree as ET
import pickle
# parameters_config = 'parameters.conf'
rsi_pkl = 'ref_splitting_info.pkl'
acec_pkl = 'atomchunk2enlargedchunk.pkl'
H5GROUP = h5py._hl.group.Group
H5DATASET = h5py._hl.dataset.Dataset
# for change file name
SPACE_ALTER = "_"
# contig attri names
SEQUENCE = 'sequence'
SEQUENCE_LENGTH = 'seqLength'
SEQUENCE_MD5 = 'md5'
refMaxLength = 3e12
COLUMNS = 60
PAD = 15
max_numpartitions = 10000
reads_shuffle_factor = 3
# sleep seconds when get data from master node/HDFS
MAX_SLEEP_SECONDS = 100
# for split reference to multi chunks
max_chunk_length = 25000
max_reads_per_chunk = 5000
limitation_readsnum = max_reads_per_chunk * 5
# dir for save the results in HDFS
HDFS_IPDINFO_DIR = '/fastipd'
HDFS_SAMIPDINFO_DIR = '/samipd'
HDFS_MODS_DIR = '/mods'
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# read configfile to get parameters------------------------------------------
def getParametersFromFile():
# conf = ConfigParser.ConfigParser()
# conf.read(config_file)
global shell_script_baxh5
global shell_script_cmph5
global shell_script_mods
global shell_script_sa
global TEMP_OUTPUT_FOLDER
global SMRT_ANALYSIS_HOME
global DATA_SAVE_MODE
global REFERENCE_DIR
global REF_FILENAME
global REF_SA_FILENAME
global CELL_DATA_DIR
global HDFS_CMD
global PROC_NUM
global BAXH5_FOLDS
global READS_TRIM_STRATEGY
global IPDMAXCOVERAGE
global METHYLATION_TYPES
global GET_IPD_FROM_BASH5
global GET_IPD_FROM_CMPH5
global SPARK_EXECUTOR_MEMORY
global SPARK_TASK_CPUS
global SPARK_MEMORY_FRACTION
global SPARK_MEMORY_STORAGEFRACTION
global SPARK_EXECUTOR_CORES
global SPARK_EXECUTOR_INSTANCES
# set the file path of these four scripts---------------
# NOTE that these scripts must be executable. If not, use 'chmod +x'
SCRIPTS_FOLDER = '/home/hadoop/workspace_py/basemods_spark/scripts'
shell_script_baxh5 = SCRIPTS_FOLDER + '/baxh5_operations.sh'
shell_script_cmph5 = SCRIPTS_FOLDER + '/cmph5_operations.sh'
shell_script_mods = SCRIPTS_FOLDER + '/mods_operations.sh'
shell_script_sa = SCRIPTS_FOLDER + '/exec_sawriter.sh'
# [FilePath]
# directory for storing the temp data in the master node and each worker node
# Note that the dir must have enough space.
TEMP_OUTPUT_FOLDER = '/tmp/basemods_spark_data'
# the path of SMRT-Analysis in each worker node
# for now, we are using SMRT-Analysis V2.3.0 (smrtanalysis_2.3.0.140936)
SMRT_ANALYSIS_HOME = '/home/hadoop/smrtanalysis'
# choose to put the cell data into your master node or HDFS
# in yarn cluster mode, just use "HDFS"
DATA_SAVE_MODE = 'HDFS'
# the reference file directory in HDFS
REFERENCE_DIR = 'hdfs://127.0.0.1:9000/data/pacbio/lambda_v210'
REF_FILENAME = 'lambda.fasta'
# please put .sa file in the same directory (REFERENCE_DIR) of your reference file (REF_FILENAME)
# if there is no .sa file, assign "None" to REF_SA_FILENAME
REF_SA_FILENAME = 'lambda.fasta.sa'
# the smrt cell data directory in HDFS
CELL_DATA_DIR = 'hdfs://127.0.0.1:9000/data/pacbio/lambda_v210'
# [PipelineArgs]
# the location of the 'hdfs' shell script
HDFS_CMD = '/usr/local/hadoop/bin/hdfs'
# the number of processors you allow SMRT-Analysis to use in each worker node
# It's ok to set PROC_NUM to 39 if each worker node has 40 processors.
PROC_NUM = 3
# the folds of each bax.h5 file you want to split to
BAXH5_FOLDS = 1
# for now it is USELESS
# strategy for trimming reads in repeats region
# "random" or "mapqv"
READS_TRIM_STRATEGY = 'random'
# maxCoverage in ipdSummary.py
IPDMAXCOVERAGE = 250
# methylation types to be identified, for now there are three kinds: "m6A,m5C,m4C"
# Use ',' as delimiter.
METHYLATION_TYPES = 'm6A, m4C'
METHYLATION_TYPES = trim_spaces(METHYLATION_TYPES)
# whether to write IPD value to file or not
# "YES" or "NO"
GET_IPD_FROM_BASH5 = 'NO'
GET_IPD_FROM_CMPH5 = 'NO'
# should be no greater than the memory of a worker node
SPARK_EXECUTOR_MEMORY = '4g'
# Number of cores (identical to virtual cores/cpu processors) to allocate for each task.
# 2, 4 or 5?
SPARK_TASK_CPUS = '1'
# default value in Spark Configuration is 0.6
SPARK_MEMORY_FRACTION = '0.6'
# default value in Spark Configuration is 0.5
SPARK_MEMORY_STORAGEFRACTION = '0.5'
# only for Spark on YARN mode------------------------------------
# for now, this parameter should be equal to the number of worker nodes
# e.g. Suppose you have 5 worker nodes, set the number of executor instances to 5.
SPARK_EXECUTOR_INSTANCES = '1'
# this parameter should better be equal to (the number of cpu cores each worker node has - 1).
# e.g. Suppose you have 64 cores (identical to virtual cores/cpu processors) in each worker node,
# set the number of executor cores to 63.
# OR, set it to the number of "yarn.nodemanager.resource.cpu-vcores"/"yarn.scheduler.maximum-allocation-vcores"
# in $HADOOP_HOME/etc/hadoop/yarn-site.xml.
SPARK_EXECUTOR_CORES = '4'
# ---------------------------------------------------------------
return
# for trim spaces
def trim_spaces(ori_str):
return str(ori_str).strip().replace(", ", ",")
# fasta_info.py-------------------------
def getRefInfoFromFastaFiles(filepaths):
"""
:param filepaths: list of filepath
:return:
"""
contiginfo_dict = {}
for fp in filepaths:
contiginfo_dict.update(_getRefInfoFromFastaFile(fp))
return contiginfo_dict
def _getRefInfoFromFastaFile(filepath):
contiginfos = FastaInfo(filepath).getContigs()
contiginfo_dict = {}
for contiginfo in contiginfos:
# contiginfo_dict[contiginfo.getContigName()] = contiginfo
contiginfo_dict[contiginfo.getContigName()] = {}
contiginfo_dict[contiginfo.getContigName()][SEQUENCE] = contiginfo.getSequence()
contiginfo_dict[contiginfo.getContigName()][SEQUENCE_LENGTH] = contiginfo.getSeqLength()
contiginfo_dict[contiginfo.getContigName()][SEQUENCE_MD5] = contiginfo.getMd5()
del contiginfos
return contiginfo_dict
class FastaInfo:
def __init__(self, filepath):
self._contigs = [] # list of ContigInfo()
with open(filepath, mode='r') as rf:
contigTmp = ContigInfo()
firstline = next(rf)
if not str(firstline).startswith(">"):
print("read fasta file wrong!")
raise ValueError
else:
contigTmp.setContigName(firstline.strip()[1:])
sequencetmp = ""
for line in rf:
if line.startswith(">"):
contigTmp.setSequence(sequencetmp)
contigTmp.setSeqLength(len(sequencetmp))
contigTmp.setMd5(hashlib.md5(sequencetmp).hexdigest())
self._contigs.append(contigTmp)
sequencetmp = ""
contigTmp = ContigInfo()
contigTmp.setContigName(line.strip()[1:])
else:
sequencetmp += line.strip()
# the last contig
contigTmp.setSequence(sequencetmp)
contigTmp.setSeqLength(len(sequencetmp))
contigTmp.setMd5(hashlib.md5(sequencetmp).hexdigest())
self._contigs.append(contigTmp)
def getContigs(self):
return self._contigs
class ContigInfo:
def __init__(self):
self._contigName = ""
self._sequence = ""
self._seqLength = 0
self._md5 = ""
def getContigName(self):
return self._contigName
def setContigName(self, contigname):
self._contigName = contigname
def getSequence(self):
return self._sequence
def setSequence(self, sequence):
self._sequence = sequence
def getSeqLength(self):
return self._seqLength
def setSeqLength(self, seqlength):
self._seqLength = seqlength
def getMd5(self):
return self._md5
def setMd5(self, md5):
self._md5 = md5
# CmpH5Format.py-----------------------
class CmpH5Format:
# def __init__(self, cmpH5):
def __init__(self):
# if ('Version' in cmpH5.attrs):
# self.VERSION = cmpH5.attrs['Version']
self.ALN_INFO = 'AlnInfo'
self.REF_INFO = 'RefInfo'
self.MOVIE_INFO = 'MovieInfo'
self.REF_GROUP = 'RefGroup'
self.ALN_GROUP = 'AlnGroup'
self.ALN_INDEX_NAME = 'AlnIndex'
self.FILE_LOG = 'FileLog'
self.BARCODE_INFO = 'BarcodeInfo'
self.ALN_INDEX = '/'.join([self.ALN_INFO, self.ALN_INDEX_NAME])
self.REF_GROUP_ID = '/'.join([self.REF_GROUP, 'ID'])
self.REF_GROUP_PATH = '/'.join([self.REF_GROUP, 'Path'])
self.REF_GROUP_INFO_ID = '/'.join([self.REF_GROUP, 'RefInfoID'])
self.REF_OFFSET_TABLE = '/'.join([self.REF_GROUP, 'OffsetTable'])
self.ALN_GROUP_ID = '/'.join([self.ALN_GROUP, 'ID'])
self.ALN_GROUP_PATH = '/'.join([self.ALN_GROUP, 'Path'])
# Movie Info
self.MOVIE_INFO_ID = '/'.join([self.MOVIE_INFO, 'ID'])
self.MOVIE_INFO_NAME = '/'.join([self.MOVIE_INFO, 'Name'])
self.MOVIE_INFO_EXP = '/'.join([self.MOVIE_INFO, 'Exp'])
self.MOVIE_INFO_FRAME_RATE = '/'.join([self.MOVIE_INFO, 'FrameRate'])
self.MOVIE_INFO_RUN = '/'.join([self.MOVIE_INFO, 'Run'])
self.MOVIE_INFO_BINDING_KIT = '/'.join([self.MOVIE_INFO, 'BindingKit'])
self.MOVIE_INFO_SEQUENCING_KIT = '/'.join([self.MOVIE_INFO, 'SequencingKit'])
self.MOVIE_INFO_SOFTWARE_VERSION = '/'.join([self.MOVIE_INFO, 'SoftwareVersion'])
(self.ID, self.ALN_ID, self.MOVIE_ID, self.REF_ID, self.TARGET_START,
self.TARGET_END, self.RC_REF_STRAND, self.HOLE_NUMBER, self.SET_NUMBER,
self.STROBE_NUMBER, self.MOLECULE_ID, self.READ_START, self.READ_END,
self.MAP_QV, self.N_MATCHES, self.N_MISMATCHES, self.N_INSERTIONS,
self.N_DELETIONS, self.OFFSET_BEGIN, self.OFFSET_END, self.N_BACK,
self.N_OVERLAP) = range(0, 22)
# self.extraTables = ['/'.join([self.ALN_INFO, x]) for x in
# cmpH5[self.ALN_INFO].keys()
# if not x == self.ALN_INDEX_NAME]
# sorting
self.INDEX_ATTR = "Index"
self.INDEX_ELTS = ['REF_ID', 'TARGET_START', 'TARGET_END']
# movie_chemistry.py-----------------
def getMoviesChemistry(filepaths):
"""
:param filepaths: list of filepath
:return:
"""
moviestriple = {}
for filepath in filepaths:
moviestriple.update(MovieChemistry(filepath).getMovieTriple())
return moviestriple
class ChemistryLookupError(Exception):
pass
class MovieChemistry:
def __init__(self, filepath):
self._movietriple = {}
if str(filepath).endswith(".xml"):
moviename = str(filepath).split("/")[-1].split(".")[0]
triple = self.tripleFromMetadataXML(filepath)
tripledict = dict()
# tripledict['Name'] = moviename
tripledict['BindingKit'] = triple[0]
tripledict['SequencingKit'] = triple[1]
tripledict['SoftwareVersion'] = triple[2]
self._movietriple[moviename] = tripledict
else:
pass
def getMovieTriple(self):
return self._movietriple
def tripleFromMetadataXML(self, metadataXmlPath):
"""
from pbcore.io.BasH5IO
Scrape the triple from the metadata.xml, or exception if the file
or the relevant contents are not found
"""
nsd = {None: "http://pacificbiosciences.com/PAP/Metadata.xsd",
"pb": "http://pacificbiosciences.com/PAP/Metadata.xsd"}
try:
tree = ET.parse(metadataXmlPath)
root = tree.getroot()
bindingKit = root.find("pb:BindingKit/pb:PartNumber", namespaces=nsd).text
sequencingKit = root.find("pb:SequencingKit/pb:PartNumber", namespaces=nsd).text
# The instrument version is truncated to the first 3 dot components
instrumentControlVersion = root.find("pb:InstCtrlVer", namespaces=nsd).text
verComponents = instrumentControlVersion.split(".")[0:2]
instrumentControlVersion = ".".join(verComponents)
return (bindingKit, sequencingKit, instrumentControlVersion)
except Exception as e:
raise ChemistryLookupError, \
("Could not find, or extract chemistry information from, %s" % (metadataXmlPath,))
# bash5_splitting.py-------------------
def split_holenumbers(holenumbers, folds=1):
holenumbers_len = len(holenumbers)
if folds > holenumbers_len:
folds = holenumbers_len
onefoldlen_base = holenumbers_len / folds
fold_yu = holenumbers_len % folds
hole_splitspots = [onefoldlen_base] * folds
hole_splitspots[0] += fold_yu
endOffset = numpy.cumsum(hole_splitspots)
beginOffset = numpy.hstack(([0], endOffset[0:-1]))
offsets = zip(beginOffset, endOffset)
return offsets
def makeOffsetsDataStructure(baxh5obj):
"""
from pbcore.io.BasH5IO.py
:param baxh5obj:
:return:
"""
numEvent = baxh5obj["/PulseData/BaseCalls/ZMW/NumEvent"].value
holeNumber = baxh5obj["/PulseData/BaseCalls/ZMW/HoleNumber"].value
endOffset = numpy.cumsum(numEvent)
beginOffset = numpy.hstack(([0], endOffset[0:-1]))
offsets = zip(beginOffset, endOffset)
return dict(zip(holeNumber, offsets))
def get_movieName(baxh5file):
"""
from pbcore.io.BasH5IO.py.movieName
:param baxh5file:
:return:
"""
movieNameAttr = baxh5file["/ScanData/RunInfo"].attrs["MovieName"]
# In old bas.h5 files, attributes of ScanData/RunInfo are stored as
# strings in arrays of length one.
if (isinstance(movieNameAttr, (numpy.ndarray, list)) and
len(movieNameAttr) == 1):
movieNameString = movieNameAttr[0]
else:
movieNameString = movieNameAttr
if not isinstance(movieNameString, basestring):
raise TypeError("Unsupported movieName {m} of type {t}."
.format(m=movieNameString,
t=type(movieNameString)))
return movieNameString
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------------
# calculate .fasta.sa file --------------------------------------------
def exec_sawriter(sa_script_path, ref_fasta_filepath):
sa_operations = "{sa_script} {seymour_home} {ref_fasta_file}".\
format(sa_script=sa_script_path,
seymour_home=SMRT_ANALYSIS_HOME,
ref_fasta_file=ref_fasta_filepath)
sa_process = Popen(sa_operations, stdout=PIPE, stderr=PIPE, shell=True)
sa_out, sa_error = sa_process.communicate(sa_operations)
if "[Errno" in sa_error.strip() or "error" in sa_error.strip().lower():
raise ValueError("sa process failed to complete!\n"
"try again or use blasr.sawriter to generate the fasta.sa file of your"
"REFERENCE(.fasta) file manually\n"
"(Error)\n stdout: {} \n stderr: {}".
format(sa_out, sa_error))
if sa_process.returncode != 0:
raise ValueError("sa process failed to complete!\n"
"try again or use blasr.sawriter to generate the fasta.sa file of your"
"REFERENCE(.fasta) file manually\n"
"(Non-zero return code)\n stdout: {} \n"
" stderr: {}".format(sa_out, sa_error))
else:
print("\nsa process logging:\n stdout:{} \n".format(sa_out))
return os.path.basename(ref_fasta_filepath) + ".sa"
def rename(oriname):
return oriname.replace(' ', SPACE_ALTER)\
.replace('/', SPACE_ALTER)\
.replace(':', SPACE_ALTER)\
.replace(';', SPACE_ALTER)
# name ref contig file
def name_reference_contig_file(ref_name, contigname):
ref_prefix, ref_ext = os.path.splitext(ref_name)
contigname = rename(contigname)
return ref_prefix + '.' + contigname + ref_ext
# for write ipd info --------------------------------------------------
def asciiFromQvs(a):
return (numpy.clip(a, 0, 93).astype(numpy.uint8) + 33).tostring()
def zmwReads(inBasH5, readType='subreads'):
"""
Extract all reads of the appropriate read type
"""
for zmw in inBasH5:
if readType == "ccs":
r = zmw.ccsRead
if r:
yield r
elif readType == "unrolled":
yield zmw.read()
else:
for r in zmw.subreads:
yield r
def write_ipd_of_bash5(inBasH5File, outIpdInfoFile):
DELIMITER1 = "@"
DELIMITER2 = "+quality value"
DELIMITER3 = "+IPD value"
start = time.time()
inBasH5 = BasH5Reader(inBasH5File)
issuccess = 0
try:
outIpdInfo = open(outIpdInfoFile, 'w')
for zmwRead in zmwReads(inBasH5):
readinfotmp = "\n".join([DELIMITER1 + zmwRead.readName,
zmwRead.basecalls(),
DELIMITER2,
asciiFromQvs(zmwRead.QualityValue()),
DELIMITER3,
" ".join(map(str, zmwRead.IPD()))])
outIpdInfo.write(readinfotmp + '\n')
outIpdInfo.flush()
outIpdInfo.close()
print("the IPD info of {} has been extracted and written.\ncost {} seconds"
.format(inBasH5File, time.time() - start))
issuccess = 1
except IOError:
print("IOError while writing {}".format(outIpdInfoFile))
finally:
return issuccess
def get_ipdvalue_of_h5_to_hdfs(inH5File, hdfs_data_dir, h5Type="BAXH5", max_sleep_seconds=1):
"""
:param inH5File:
:param hdfs_data_dir:
:param h5Type: "BAXH5" or "CMPH5"
:param max_sleep_seconds:
:return:
"""
if h5Type == "BAXH5":
name_prefix = os.path.basename(inH5File).split(".bax.h5")[0]
dirpath = os.path.dirname(inH5File)
outIpdInfoFileName = name_prefix + ".fastipd"
outIpdInfoFile = '/'.join([dirpath, outIpdInfoFileName])
wissuccess = write_ipd_of_bash5(inH5File, outIpdInfoFile)
elif h5Type == "CMPH5":
name_prefix = os.path.basename(inH5File).split(".cmp.h5")[0]
outIpdInfoFileName = name_prefix + ".samipd"
dirpath = os.path.dirname(inH5File)
outIpdInfoFile = '/'.join([dirpath, outIpdInfoFileName])
wissuccess = write_ipd_of_cmph5(inH5File, outIpdInfoFile)
else:
print("arg h5Type is not set rightly")
return inH5File
if wissuccess:
cmd_output, cmd_errors = run_cmd_safe([HDFS_CMD, 'dfs', '-copyFromLocal', '-f',
outIpdInfoFile,
'/'.join([hdfs_data_dir, outIpdInfoFileName])],
max_sleep_seconds)
os.remove(outIpdInfoFile) # rm temp ipdinfo file
else:
print("failed to write {}".format(outIpdInfoFile))
return inH5File
def get_baxh5file_from_hdfs(hdfs_filepath, local_temp_dir, max_sleep_seconds=1):
if not os.path.isdir(local_temp_dir):
try:
os.mkdir(local_temp_dir, 0777)
except:
print('local temp directory {} exists.'.format(local_temp_dir))
filename = os.path.basename(hdfs_filepath)
local_filepath = '/'.join([local_temp_dir, filename])
cmd_output, cmd_errors = run_hdfs_get_cmd(hdfs_filepath, local_filepath,
max_sleep_seconds)
if "[Errno" in cmd_errors.strip() or "error" in cmd_errors.strip().lower():
raise RuntimeError("hdfs get process failed to complete! (Error)\n stdout: {} \n stderr: {}".
format(cmd_output, cmd_errors))
else:
# print("\nhdfs get process logging:\n stdout:{} \n".format(cmd_output))
# ----transmit ipd value to HDFS
# FIXME: need to delete this when it is useless
if str(GET_IPD_FROM_BASH5).lower() == 'yes':
# hdfs_dir = os.path.dirname(hdfs_filepath)
hdfs_dir = CELL_DATA_DIR + HDFS_IPDINFO_DIR
get_ipdvalue_of_h5_to_hdfs(local_filepath, hdfs_dir, "BAXH5")
return local_filepath
# convert baxh5 to list------------------------------------------------
def get_chunks_of_baxh5file(baxh5file, folds=1):
f = h5py.File(baxh5file, "r")
holenumbers = f['/PulseData/BaseCalls/ZMW/HoleNumber'].value
if folds > len(holenumbers):
folds = len(holenumbers)
elif folds < 1:
folds = 1
hole_splitspots = split_holenumbers(holenumbers, folds)
hole2range = makeOffsetsDataStructure(f) # todo: how to make it faster
basecall_splitspots = get_basecall_range_of_each_holesblock(hole_splitspots,
holenumbers, hole2range)
moviename = get_movieName(f)
chunk_data_info = []
# datasets in PulseData/BaseCalls/ZMW
# FIXME: use (for key in f['']) or (for key in f[''].keys())?
for key in f['/PulseData/BaseCalls/ZMW']:
chunk_data_info.extend(get_chunks_in_zmw_info(hole_splitspots, holenumbers,
'/PulseData/BaseCalls/ZMW/' + str(key),
moviename))
# datasets in PulseData/BaseCalls/ZMWMetrics
for key in f['/PulseData/BaseCalls/ZMWMetrics']:
chunk_data_info.extend(get_chunks_in_zmw_info(hole_splitspots, holenumbers,
'/PulseData/BaseCalls/ZMWMetrics/' + str(key),
moviename))
# datasets in PulseData/BaseCalls
for key in f['/PulseData/BaseCalls']:
if not (str(key) == 'ZMWMetrics' or str(key) == 'ZMW'):
chunk_data_info.extend(get_chunks_in_basecalls_info(hole_splitspots, holenumbers,
basecall_splitspots,
'/PulseData/BaseCalls/' + str(key),
moviename))
# PulseData/Regions
chunk_data_info.extend(get_chunks_in_region_info(hole_splitspots, holenumbers, f,
'/PulseData/Regions', moviename))
# baxh5attrs = get_baxh5_attrs(f)
f.close()
chunk_data_group = group_folds_of_one_baxh5file(chunk_data_info)
del chunk_data_info
print('done splitting {} to {} chunk(s)'.format(baxh5file, folds))
return map(lambda x: add_each_fold_the_filepath(x, baxh5file),
chunk_data_group)
def add_each_fold_the_filepath(x, baxh5filepath):
return x[0], (baxh5filepath, x[1])
def group_folds_of_one_baxh5file(holerange_data):
group_holerange_data = []
for key, group in groupby(sorted(holerange_data), lambda x: x[0]):
data_set = []
for h5data in group:
data_set.append(h5data[1])
group_holerange_data.append((key, data_set))
return group_holerange_data
def get_chunks_in_zmw_info(hole_splitspots,
holenumbers, datasetname, moviename):
chunks_info = []
for holess in hole_splitspots:
chunks_info.append(((moviename, (holenumbers[holess[0]], holenumbers[holess[1] - 1])),
(datasetname, (holess[0], holess[1]))))
return chunks_info
def get_chunks_in_basecalls_info(hole_splitspots, holenumbers,
basecall_splitspots, datasetname, moviename):
chunks_info = []
for i in range(0, len(hole_splitspots)):
chunks_info.append(((moviename, (holenumbers[hole_splitspots[i][0]],
holenumbers[hole_splitspots[i][1] - 1])),
(datasetname, (basecall_splitspots[i][0],
basecall_splitspots[i][1]))))
return chunks_info
def get_chunks_in_region_info(hole_splitspots,
holenumbers, f, datasetname, moviename):
regiondata = f[datasetname]
sshape = regiondata.shape
chunks_info = []
locs_start = 0
for holess in hole_splitspots[:-1]:
holenumbers_tmp = set(holenumbers[holess[0]:holess[1]])
for i in range(locs_start, sshape[0]):
if regiondata[i, 0] not in holenumbers_tmp:
chunks_info.append(((moviename, (holenumbers[holess[0]], holenumbers[holess[1] - 1])),
(datasetname, (locs_start, i))))
locs_start = i
break
chunks_info.append(((moviename, (holenumbers[hole_splitspots[-1][0]],
holenumbers[hole_splitspots[-1][1] - 1])),
(datasetname, (locs_start, sshape[0]))))
return chunks_info
def get_basecall_range_of_each_holesblock(hole_splitspots, holenumbers, holerange):
basecall_splitspots = []
for hole_splitspot in hole_splitspots:
begin, end = holerange[holenumbers[hole_splitspot[0]]][0], \
holerange[holenumbers[hole_splitspot[1] - 1]][1]
basecall_splitspots.append((begin, end))
return basecall_splitspots
# FIXME: 1. how to get a list of all the datasets and groups of an h5file at once?
# FIXME: h5obj.visit(printname) can only print the list
# FIXME: 2. how to check a item in h5obj is a group or a dataset?
# get baxh5 attrs------------------------------------------------------
def get_baxh5_attrs(baxh5obj):
attrslist = []
# ScanData
attrslist.append((('/ScanData', H5GROUP), get_h5item_attrs(baxh5obj, '/ScanData')))
for key in baxh5obj['/ScanData'].keys():
pathtmp = '/ScanData/' + str(key)
if isinstance(baxh5obj[pathtmp], H5GROUP):
attrslist.append(((pathtmp, H5GROUP), get_h5item_attrs(baxh5obj, pathtmp)))
else:
attrslist.append(((pathtmp, H5DATASET), get_h5item_attrs(baxh5obj, pathtmp)))
# PulseData
attrslist.append((('/PulseData/BaseCalls', H5GROUP),
get_h5item_attrs(baxh5obj, '/PulseData/BaseCalls')))
attrslist.append((('/PulseData/Regions', H5DATASET),
get_h5item_attrs(baxh5obj, '/PulseData/Regions')))
return attrslist
def get_h5item_attrs(h5obj, itempath):
attrslist = []
itemattrs = h5obj[itempath].attrs
for key, val in itemattrs.items():
attrslist.append((key, val))
return attrslist
# operations for each rdd element in baxh5RDD-------------------------
def basemods_pipeline_baxh5_operations(keyval):
"""
keyval: an element of baxh5RDD
((moviename, holerange), (filepath,
[(datasetname, (dataset_begin_spot, dataset_end_spot)),...]))
:param keyval:
:return: aligned reads
"""
fileinfo, filecontent = keyval
reference_path = SparkFiles.get(REF_FILENAME)
referencesa_path = SparkFiles.get(USED_REF_SA_FILENAME)
baxh5_shell_file_path = SparkFiles.get(shell_script_baxh5)
if not os.path.isdir(TEMP_OUTPUT_FOLDER):
try:
os.mkdir(TEMP_OUTPUT_FOLDER, 0777)
except:
print('temp directory {} exists.'.format(TEMP_OUTPUT_FOLDER))
if BAXH5_FOLDS == 1:
baxh5path = filecontent[0]
name_prefix = os.path.basename(baxh5path).split('.bax.h5')[0]
else:
name_prefix = (fileinfo[0] + "." + str(fileinfo[1][0]) + "-" + str(fileinfo[1][1])).replace(' ', SPACE_ALTER)
baxh5file = name_prefix + ".bax.h5"
baxh5_dir = TEMP_OUTPUT_FOLDER
baxh5path = baxh5_dir + '/' + baxh5file
writebaxh5(filecontent, baxh5path)
cmph5file = name_prefix + ".aligned_reads.cmp.h5"
# baxh5 operations (filter, align(blasr, filter, samtoh5), loadchemistry, loadpulse)
baxh5_operations = "{baxh5_operations_sh} {seymour_home} {temp_output_folder} {baxh5_filepath}" \
" {reference_filepath} {referencesa_filepath} {cmph5_filename} {proc_num}". \
format(baxh5_operations_sh=baxh5_shell_file_path,
seymour_home=SMRT_ANALYSIS_HOME,
temp_output_folder=TEMP_OUTPUT_FOLDER,
baxh5_filepath=baxh5path,
reference_filepath=reference_path,
referencesa_filepath=referencesa_path,
cmph5_filename=cmph5file,
proc_num=PROC_NUM)
baxh5_process = Popen(shlex.split(baxh5_operations), stdout=PIPE, stderr=PIPE)
baxh5_out, baxh5_error = baxh5_process.communicate()
if "[Errno" in baxh5_error.strip() or "error" in baxh5_error.strip().lower():
raise ValueError("baxh5 process failed to complete! (Error)\n stdout: {} \n stderr: {}".
format(baxh5_out, baxh5_error))
if baxh5_process.returncode != 0:
raise ValueError("baxh5 process failed to complete! (Non-zero return code)\n stdout: {} \n"
" stderr: {}".format(baxh5_out, baxh5_error))
else:
print("\nbaxh5 process logging:\n stdout:{} \n".format(baxh5_out))
cmph5_filepath = '/'.join([TEMP_OUTPUT_FOLDER, cmph5file])
# write ipdvalue of cmph5 file ---
if str(GET_IPD_FROM_CMPH5).lower() == 'yes':
if DATA_SAVE_MODE == 'HDFS':
# hdfs_dir = os.path.dirname(hdfs_filepath)
hdfs_dir = CELL_DATA_DIR + HDFS_SAMIPDINFO_DIR
get_ipdvalue_of_h5_to_hdfs(cmph5_filepath, hdfs_dir, "CMPH5")
else:
pass
# --------------------------------
aligned_reads = split_reads_in_cmph5(cmph5_filepath)
# rm temp files --------
os.remove(baxh5path)
remove_files_in_a_folder(TEMP_OUTPUT_FOLDER, name_prefix)
# ----------------------
return aligned_reads
def write_ipd_of_cmph5(inCmpH5File, outIpdInfoFile):
start = time.time()
inCmpH5 = CmpH5Reader(inCmpH5File)
issuccess = 0
try:
outIpdInfo = open(outIpdInfoFile, 'w')
outIpdInfo.write("#readID\trefID\trefStart\trefEnd\trefStrand\tMapQV\trefSeq\treadSeq\treadIPD\n")
for alignment in inCmpH5:
outIpdInfo.write('\t'.join([alignment.readName, alignment.referenceName,
str(alignment.tStart),
str(alignment.tEnd), str(alignment.RCRefStrand),
str(alignment.mapQV),
alignment.reference(), alignment.read(),
" ".join(map(str, alignment.IPD()))]) + "\n")
outIpdInfo.flush()
outIpdInfo.close()
print("the IPD info of {} has been extracted and written.\ncost {} seconds"
.format(inCmpH5File, time.time() - start))
issuccess = 1
except IOError:
print("IOError while writing {}".format(outIpdInfoFile))
finally:
return issuccess
def writebaxh5(filecontent, filepath):
"""
a lighter way
:param filecontent:
:param filepath:
:return:
"""
wf = h5py.File(filepath, "w")
if isinstance(filecontent, tuple):
ori_h5_path, h5data = filecontent
else:
raise ValueError("the format of h5file info is wrong!")
rf = h5py.File(ori_h5_path, "r")
for datasetinfo in h5data:
datasetname, (lbegin, lend) = datasetinfo
sdata = rf[datasetname]
sshape = sdata.shape
if len(sshape) == 1:
wf.create_dataset(datasetname, data=sdata[lbegin:lend])
else:
wf.create_dataset(datasetname, data=sdata[lbegin:lend, :])
h5attr = get_baxh5_attrs(rf)
for itemattrinfo in h5attr:
item_name_type, itemattrs = itemattrinfo
if item_name_type[1] == H5GROUP:
itemtmp = wf.require_group(item_name_type[0])
else:
if item_name_type[0] in wf:
itemtmp = wf[item_name_type[0]]
else:
# FIXME: this is not recommended, all data of datasets
# FIXME: should be written in h5data (the last for loop)
itemtmp = wf.create_dataset(item_name_type[0], (1,))
for itemattr_name, itemattr_val in itemattrs:
itemtmp.attrs[itemattr_name] = itemattr_val
rf.close()
wf.close()
def split_reads_in_cmph5(cmph5_filepath):
"""
:param cmph5_filepath:
:return: info about each read, [(key, value), ...]
key: (reffullname, target_start, target_end)
value: dict(), {'/../AlnIndex': a line of AlnIndex, 'AlnArray': numpy.array,
'DeletionQV': numpy.array, ...}
"""
cmph5 = h5py.File(cmph5_filepath, 'r')
ch5Format = CmpH5Format()
aI = cmph5[ch5Format.ALN_INDEX]
aIlen = aI.shape[0]
refId2refFullName = refGroupId2RefInfoIdentifier(cmph5, ch5Format)
# refId2refMd5 = refGroupId2RefInfoIdentifier(cmph5, ch5Format, "MD5")
movieId2MovieInfo = MovieId2MovieInfo(cmph5, ch5Format)
reads_rdd = [None] * aIlen
for i in range(0, aIlen):
alnIndexTmp = aI[i, :]
read_key = (refId2refFullName[alnIndexTmp[ch5Format.REF_ID]],
alnIndexTmp[ch5Format.TARGET_START],
alnIndexTmp[ch5Format.TARGET_END])
read_val = dict()
# AlnIndex dataset
read_val[ch5Format.ALN_INDEX] = alnIndexTmp
alignInfoPath = cmph5[ch5Format.ALN_GROUP_PATH][numpy.where(
cmph5[ch5Format.ALN_GROUP_ID].value == alnIndexTmp[ch5Format.ALN_ID])[0][0]]
# align info datasets
offset_begin = alnIndexTmp[ch5Format.OFFSET_BEGIN]
offset_end = alnIndexTmp[ch5Format.OFFSET_END]
for key in cmph5[alignInfoPath].keys():
read_val[str(key)] = cmph5[alignInfoPath + '/' + str(key)][offset_begin:offset_end]
# movie info
movieid = alnIndexTmp[ch5Format.MOVIE_ID]
read_val.update(movieId2MovieInfo[movieid])
reads_rdd[i] = (read_key, read_val)
cmph5.close()
return reads_rdd
def refGroupId2RefInfoIdentifier(cmph5, ch5Format, refidentifier='FullName'):
"""
:param cmph5:
:param ch5Format:
:param refidentifier: 'FullName' or 'MD5'
:return:
"""
refID = cmph5[ch5Format.REF_GROUP_ID]
refInfoID = cmph5[ch5Format.REF_GROUP_INFO_ID]
_refInfoID = cmph5['/'.join([ch5Format.REF_INFO, 'ID'])]
refid = cmph5['/'.join([ch5Format.REF_INFO, refidentifier])]
refid2refid = {}
refshape = refID.shape
for i in range(0, refshape[0]):
refid2refid[refID[i]] = refid[numpy.where(_refInfoID.value == refInfoID[i])[0][0]]
return refid2refid
def MovieId2MovieInfo(cmph5, ch5Format, infonames="Name,FrameRate"):
"""
:param cmph5:
:param ch5Format:
:param infonames:names of datasets in MovieInfo, delimited by ","
"ID,Name,Exp,FrameRate,Run,BindingKit,SequencingKit,SoftwareVersion"
:return:
"""
movieinfo = cmph5[ch5Format.MOVIE_INFO]
movieid = movieinfo["ID"]
res = {}
for i in range(0, movieid.shape[0]):
res[movieid[i]] = {}
for datasetname in infonames.split(","):
res[movieid[i]]["/".join([ch5Format.MOVIE_INFO, datasetname.strip()])] = \
movieinfo[datasetname.strip()][0]
return res
# group reads of cmp.h5 file------------------------------------------
def _queueChunksForReference(numHits, refLength):
"""
from ipdsummary.py
Compute the chunk extents and queue up the work for a single reference
:param numHits: num of reads aligned to the ref
:param refLength: ref length
:return:
"""
# Number of hits on current reference
# refGroupId = refInfo.ID
# numHits = (self.cmph5.RefGroupID == refGroupId).sum()
# Don't process reference groups with 0 hits. They may not exist?
if numHits == 0:
return
# Maximum chunk size (set no larger than 1Mb for now)
MAX_BLOCK_SIZE = max_chunk_length
# Maximum number of hits per chunk
MAX_HITS = max_reads_per_chunk
# nBases = min(refInfo.Length, refMaxLength)
nBases = min(refLength, refMaxLength)
# Adjust numHits if we are only doing part of the contig
numHits = (numHits * nBases) / refLength
nBlocks = max([numHits / MAX_HITS, nBases / (MAX_BLOCK_SIZE - 1) + 1])
# Including nBases / (MAX_BLOCK_SIZE - 1) + 1 in nBlocks calculation:
# E. coli genome: this should be ~ 10.