-
Notifications
You must be signed in to change notification settings - Fork 43
/
crispor.py
executable file
·8916 lines (7485 loc) · 364 KB
/
crispor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/data/www/crispor/venv/bin/python3
# if you do not want the hardcoded PATH above, delete this line and the one above to use the default Python3 interpreter
#!/usr/bin/env python3
# I know that this line looks unprofessional to you, but modifying the PATH on a shared Apache webserver is not obvious.
# the tefor crispr tool
# can be run as a CGI or from the command line
# python std library
import subprocess, tempfile, optparse, logging, atexit, glob, shutil
import http.cookies, time, sys, cgi, re, random, platform, os, pipes
import hashlib, base64, string, logging, operator, urllib.request, urllib.parse, urllib.error, time
import traceback, json, pwd, gzip, zlib
from io import StringIO
from collections import defaultdict, namedtuple
from datetime import datetime
from itertools import product
from os.path import abspath, basename, dirname, isdir, isfile, join, relpath
try:
# prefer the pip package, it's more up-to-date than the native package
import pysqlite3 as sqlite3
except:
import sqlite3
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict # python2.6 users: run 'sudo pip install ordereddict'
# for matplotlib, improves "import" performance
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib-cache"
# try to load external dependencies
# we're going into great lengths to create a readable error message
needModules = set(["pytabix", "twobitreader", "pandas", "matplotlib", "scipy"])
try:
import tabix # if not found, install with 'pip install pytabix'
needModules.remove("pytabix")
except:
pass
try:
import twobitreader # if not found, install with 'pip install twobitreader'
needModules.remove("twobitreader")
except:
pass
try:
import pandas # required by doench2016 score. install with 'pip install pandas'
needModules.remove("pandas")
import scipy # required by doench2016 score. install with 'pip install scipy'
needModules.remove("scipy")
import matplotlib # required by doench2016 score. install with 'pip install matplotlib'
needModules.remove("matplotlib")
import numpy # required by doench2016 score. install with 'pip install numpy'
needModules.remove("numpy")
except:
pass
if len(needModules)!=0:
print("Content-type: text/html\n")
print(("Python interpreter path: %s<p>" % sys.executable))
print(("These python modules were not found: %s<p>" % ",".join(needModules)))
print("To install all requirements in one line, run: sudo pip install biopython numpy scikit-learn==0.16.1 pandas twobitreader<p>")
sys.exit(0)
# our own eff scoring library
import crisporEffScores
# don't report print as an error
# pylint: disable=E1601
# optional module for Excel export as native .xls files
# install with 'apt-get install python-xlwt' or 'pip install xlwt'
xlwtLoaded = True
try:
import xlwt
except:
sys.stderr.write("crispor.py - warning - the python xlwt module is not available\n")
xlwtLoaded = False
# optional module for mysql support
#try:
#import MySQLdb
#mysqldbLoaded = True
#except:
#mysqldbLoaded = False
# version of crispor
versionStr = "5.2"
# contact email
contactEmail='[email protected]'
# url to this server
ctBaseUrl = "http://crispor-max.tefor.net/temp/customTracks"
# write debug output to stdout
DEBUG = False
#DEBUG = True
# use bowtie for off-target search?
useBowtie = False
# calculate the efficienc scores?
doEffScoring = True
# system-wide temporary directory
#TEMPDIR = os.environ.get("TMPDIR", "/var/tmp")
TEMPDIR = "/var/tmp"
# a hack for cluster jobs at UCSC:
# - default to ramdisk
if isdir("/scratch/tmp"):
TEMPDIR = "/dev/shm/"
# skipAlign is useful if your input sequence is not in the genome at all
# - don't do bwasw
# - this will trigger auto-ontarget: any perfect match is the on-target
# - do not calculate efficiency scores
skipAlign = False
# prefix in html statements before the directories "image/", "style/" and "js/"
HTMLPREFIX = ""
# alternative directory on local disk where image/, style/ and js/ are located
HTMLDIR = "/usr/local/apache/htdocs/crispor/"
# directory of crispor.py
baseDir = dirname(__file__)
# filename of this script, usually crispor.py
myName = basename(__file__)
# the segments.bed files use abbreviated genomic region names
segTypeConv = {"ex":"exon", "in":"intron", "ig":"intergenic"}
# directory for processed batches of offtargets ("cache" of bwa results)
batchDir = join(baseDir,"temp")
# sqlite3 db with gzipped old json batch files, to avoid hitting the ext4 inode limits
batchArchive = "/data/crisporJobArchive.db"
# the file where the sqlite job queue is stored
#JOBQUEUEDB = join(TEMPDIR, "crisporJobs.db") # TEMPDIR is mapped away for security reasons under Redhat/Centos for CGIs
JOBQUEUEDB = "/data/www/temp/crisporJobs.db"
# alternatively: connection info for mysql
jobQueueMysqlConn = {"socket":None, "host":None, "user": None, "password" : None}
# directory for platform-independent scripts (e.g. Heng Li's perl SAM parser)
scriptDir = join(baseDir, "bin")
# directory for helper binaries (e.g. BWA)
binDir = abspath(join(baseDir, "bin", platform.system()))
# directory for genomes
genomesDir = join(baseDir, "genomes")
DEFAULTORG = 'hg19'
DEFAULTSEQ = 'cttcctttgtccccaatctgggcgcgcgccggcgccccctggcggcctaaggactcggcgcgccggaagtggccagggcgggggcgacctcggctcacagcgcgcccggctattctcgcagctcaccatgGATGATGATATCGCCGCGCTCGTCGTCGACAACGGCTCCGGCATGTGCAAGGCCGGCTTCGCGGGCGACGATGCCCCCCGGGCCGTCTTCCCCTCCATCGTGGGGCGCC'
# used if hg19 is not available
ALTORG = 'sacCer3'
ALTSEQ = 'ATTCTACTTTTCAACAATAATACATAAACatattggcttgtggtagCAACACTATCATGGTATCACTAACGTAAAAGTTCCTCAATATTGCAATTTGCTTGAACGGATGCTATTTCAGAATATTTCGTACTTACACAGGCCATACATTAGAATAATATGTCACATCACTGTCGTAACACTCT'
pamDesc = [ ('NGG','20bp-NGG - Sp Cas9, SpCas9-HF1, eSpCas9 1.1'),
('NNG','20bp-NNG - Cas9 S. canis'),
('NGN','20bp-NGN - SpG'),
('NNGT','20bp-NNGT - Cas9 S. canis - high efficiency PAM, recommended'),
('NAA','20bp-NAA - iSpyMacCas9'),
#('TTN','TTN-23bp - Cpf1 F. Novicida'), # Jean-Paul: various people have shown that it's not usable yet
('NNGRRT','21bp-NNG(A/G)(A/G)T - Cas9 S. Aureus'),
('NNGRRT-20','20bp-NNG(A/G)(A/G)T - Cas9 S. Aureus with 20bp-guides'),
('NGK','20bp-NG(G/T) - xCas9, recommended PAM, see notes'),
#('NGN','20bp-NGN or GA(A/T) - xCas9 (low efficiency, not recommended)'),
#('NGG-BE1','20bp-NGG - BaseEditor1, modifies C->T'),
('NNNRRT','21bp-NNN(A/G)(A/G)T - KKH SaCas9'),
('NNNRRT-20','20bp-NNN(A/G)(A/G)T - KKH SaCas9 with 20bp-guides'),
('NGA','20bp-NGA - Cas9 S. Pyogenes mutant VQR'),
('NNNNCC','24bp-NNNNCC - Nme2Cas9'),
('NGCG','20bp-NGCG - Cas9 S. Pyogenes mutant VRER'),
('NNAGAA','20bp-NNAGAA - Cas9 S. Thermophilus'),
('NGGNG','20bp-NGGNG - Cas9 S. Thermophilus'),
('NNNNGMTT','20bp-NNNNG(A/C)TT - Cas9 N. Meningitidis'),
('NNNNACA','20bp-NNNNACA - Cas9 Campylobacter jejuni, original PAM'),
('NNNNRYAC','22bp-NNNNRYAC - Cas9 Campylobacter jejuni, revised PAM'),
('NNNVRYAC','22bp-NNNVRYAC - Cas9 Campylobacter jejuni, opt. efficiency'),
('TTCN','TTCN-20bp - CasX'),
('TTTV','TTT(A/C/G)-23bp - Cas12a (Cpf1) - recommended, 23bp guides'),
('TTTV-21','TTT(A/C/G)-21bp - Cas12a (Cpf1) - 21bp guides recommended by IDT'),
('TTTN','TTTN-23bp - Cas12a (Cpf1) - low efficiency'),
('ATTN','ATTN-23bp - BhCas12b v4'),
('NGTN','NGTN-23bp - ShCAST/AcCAST, Strecker et al, Science 2019'),
('TYCV','T(C/T)C(A/C/G)-23bp - TYCV As-Cpf1 K607R'),
('TATV','TAT(A/C/G)-23bp - TATV As-Cpf1 K548V'),
('TTTA','TTTA-23bp - TTTA LbCpf1'),
('TCTA','TCTA-23bp - TCTA LbCpf1'),
('TCCA','TCCA-23bp - TCCA LbCpf1'),
('CCCA','CCCA-23bp - CCCA LbCpf1'),
('GGTT','GGTT-23bp - CCCA LbCpf1'),
('YTTV','YTTV-20bp - MAD7 Nuclease, Lui, Schiel, Maksimova et al, CRISPR J 2020'),
('TTYN','TTYN- or VTTV- or TRTV-23bp - enCas12a E174R/S542R/K548R - Kleinstiver et al Nat Biot 2019'),
('NNNNCNAA','20bp-NNNNCNAA - Thermo Cas9 - Walker et al, Metab Eng Comm 2020'),
('NNN','20bp-NNN - SpRY, Walton et al Science 2020'), # https://science.sciencemag.org/content/368/6488/290.abstract
('NRN','20bp-NRN - SpRY (high efficiency PAM)'),
('NYN','20bp-NYN - SpRY (low efficiency PAM)'),
#('VTTV','(A/C)TT(A/C)-23bp - enCas12a S542R - Kleinstiver et al Nat Biot 2019'),
#('TRTV','T(A/G)T(A/C)-23bp - enCas12a K548R - Kleinstiver et al Nat Biot 2019'),
]
DEFAULTPAM = 'NGG'
# the default base editor modification window
DEFAULTBEWIN = "1-7"
# for some PAMs, there are alternative main PAMs. These are also shown on the main sequence panel
multiPams = {
#"NGN" : ["GAW"],
"TTYN" : ["VTTV", "TRTV"]
}
# these PAMs are not specific. Allow only short sequences for them.
slowPams = ["TTYN", "NNG"]
# allow only very short sequences for these
verySlowPams = ["NNN", "NRN", "NYN"]
# for some PAMs, we allow other alternative motifs when searching for offtargets
# MIT and eCrisp do that, they use the motif NGG + NAG, we add one more, based on the
# on the guideSeq results in Tsai et al, Nat Biot 2014
# The NGA -> NGG rule was described by Kleinstiver...Young 2015 "Improved Cas9 Specificity..."
# NNGTRRT rule for S. aureus is in the new protocol "SaCas9 User manual"
# ! the length of the alternate PAM has to be the same as the original PAM!
offtargetPams = {
"NGG" : ["NAG","NGA"],
#"NGN" : ["GAW"],
"NGK" : ["GAW"],
"NGA" : ["NGG"],
"NNGRRT" : ["NNGRRN"],
"TTTV" : ["TTTN"],
'ATTN' : ["TTTN", "GTTN"],
"TTYN" : ["VTTV", "TRTV"]
}
# maximum size of an input sequence
MAXSEQLEN = 2300
# maximum input size when specifying "no genome"
MAXSEQLEN_NOGENOME = 25000
# maximum input size when using xCas9 or sCanis
MAXSEQLEN2 = 600
# maximum input size for NNN SpRY or similar PAMs
MAXSEQLEN3 = 150
# BWA: allow up to X mismatches
maxMMs=4
# maximum number of occurences in the genome to get flagged as repeats.
# This is used in bwa samse, when converting the same file
# and for warnings in the table output.
MAXOCC = 60000
# the BWA queue size is 2M by default. We derive the queue size from MAXOCC
MFAC = 2000000/MAXOCC
# the length of the guide sequence, set by setupPamInfo
GUIDELEN=None
# length of the PAM sequence
PAMLEN=None
# the name of the base editor, if any. This is the flag to activate
# baseEditor mode in the UI
baseEditor = None
# input sequences are extended by X basepairs so we can calculate the efficiency scores
# and can better design primers
FLANKLEN=100
# the name of the currently processed batch, assigned only once
# in readBatchParams and only for json-type batches
batchName = ""
# are we doing a Cpf1 run?
# this variable changes almost all processing and
# has to be set on program start, as soon as we know
# the PAM we're running on
pamIsFirst=None
saCas9Mode=False
# Highly-sensitive mode (not for CLI mode):
# MAXOCC is increased in processSubmission() and in the html UI if only one
# guide seq is run
# Also, the number of allowed mismatches is increased to 5 instead of 4
#HIGH_MAXOCC=600000
#HIGH_maxMMs=5
# minimum off-target score of standard off-targets (those that end with NGG)
# This should probably be based on the CFD score these days
# But for now, I'll let the user do the filtering
MINSCORE = 0.0
# minimum off-target score for alternative PAM off-targets
# There is not a lot of data to support this cutoff, but it seems
# reasonable to have at least some cutoff, as otherwise we would show
# NAG and NGA like NGG and the data shows clearly that the alternative
# PAMs are not recognized as well as the main NGG PAM.
# so for now, I just filter out very degenerative ones. the best solution
# would be to have a special penalty on the CFD score, but CFS does not
# support non-NGG PAMs (is this actually true?)
ALTPAMMINSCORE = 1.0
# how much shall we extend the guide after the PAM to match restriction enzymes?
pamPlusLen = 5
# global flag to indicate if we're run from command line or as a CGI
commandLineMode = False
# names/order of efficiency scores to show in UI
cas9ScoreNames = ["fusi", "crisprScan", "rs3"]
allScoreNames = ["fusi", "fusiOld", "chariRank", "ssc", "wuCrispr", "doench", "wang", "crisprScan", "aziInVitro", "ccTop", "rs3"]
mutScoreNames = []
spCas9MutScoreNames = ["oof", 'lindel'] # lindel is only added for spCas9
otherMutScoreNames = ["oof"] # lindel is only added for spCas9
cpf1ScoreNames = ["seqDeepCpf1"]
saCas9ScoreNames = ["najm"]
# to make the CFD more comparable to the MIT score, Nicholas Parkinson suggests to multiply it with 100.
# can be switched on with the URL argument fixCfd=1
doCfdFix=False
# how many digits shall we show for each score? default is 0
scoreDigits = {
"ssc" : 1,
}
# List of AddGene plasmids, their long and short names:
addGenePlasmids = [
("43860", ("MLM3636 (Joung lab)", "MLM3636")),
("49330", ("pAc-sgRNA-Cas9 (Liu lab)", "pAcsgRnaCas9")),
("42230", ("pX330-U6-Chimeric_BB-CBh-hSpCas9 (Zhang lab) + derivatives", "pX330")),
("52961", ("lentiCRISPR v2 (Zhang lab)", "lentiCrispr")),
("52963", ("lentiGuide-Puro (Zhang lab)", "lentiGuide-Puro")),
]
addGenePlasmidsAureus = [
("61591", ("pX601-AAV-CMV::NLS-SaCas9-NLS-3xHA-bGHpA;U6::BsaI-sgRNA (Zhang lab)", "pX601")),
("61592", ("pX600-AAV-CMV::NLS-SaCas9-NLS-3xHA-bGHpA (Zhang lab)", "pX600")),
("61593", ("pX602-AAV-TBG::NLS-SaCas9-NLS-HA-OLLAS-bGHpA;U6::BsaI-sgRNA (Zhang lab)", "pX602")),
("65779", ("VVT1 (Joung lab)", "VVT1"))
]
# list of AddGene primer 5' and 3' extensions, one for each AddGene plasmid
# format: prefixFw, prefixRw, u6-G-suffix, restriction enzyme, link to protocol
addGenePlasmidInfo = {
"43860" : ("ACACC", "AAAAC", "G", "BsmBI", "https://www.addgene.org/static/data/plasmids/43/43860/43860-attachment_T35tt6ebKxov.pdf"),
"49330" : ("TTC", "AAC", "", "Bsp QI", "http://bio.biologists.org/content/3/1/42#sec-9"),
"42230" : ("CACC", "AAAC", "", "Bbs1", "https://www.addgene.org/static/data/plasmids/52/52961/52961-attachment_B3xTwla0bkYD.pdf"),
"52961" : ("CACC", "AAAC", "", "BsmBI", "https://www.addgene.org/static/data/plasmids/52/52961/52961-attachment_B3xTwla0bkYD.pdf"),
"61591" : ("CACC", "AAAC", "", "BsaI", "https://www.addgene.org/static/data/plasmids/61/61591/61591-attachment_it03kn5x5O6E.pdf"),
"61592" : ("CACC", "AAAC", "", "BsaI", "https://www.addgene.org/static/data/plasmids/61/61592/61592-attachment_iAbvIKnbqNRO.pdf"),
"61593" : ("CACC", "AAAC", "", "BsaI", "https://www.addgene.org/static/data/plasmids/61/61592/61592-attachment_iAbvIKnbqNRO.pdf"),
"65779": ("CACC", "AAAC", "", "BsmBI (aka Esp3l)", "https://www.addgene.org/static/data/plasmids/65/65779/65779-attachment_G8oNyvV6pA78.pdf"),
"52963": ("CACC", "AAAC", "", "BsmBI (aka Esp3l)", "https://www.addgene.org/static/data/plasmids/52/52963/52963-attachment_IPB7ZL_hJcbm.pdf")
}
# the barcodes for subpool tagging for oligo pool tables
satMutBarcodes = [
(0, "No Subpool barcode"),
(1, "Subpool 1: CGGGTTCCGT/GCTTAGAATAGAA"),
(2, "Subpool 2: GTTTATCGGGC/ACTTACTGTACC"),
(3, "Subpool 3: ACCGATGTTGAC/CTCGTAATAGC"),
(4, "Subpool 4: GAGGTCTTTCATGC/CACAACATA"),
(5, "Subpool 5: TATCCCGTGAAGCT/TTCGGTTAA"),
(6, "Subpool 6: TAGTAGTTCAGACGC/ATGTACCC"),
(7, "Subpool 7: GGATGCATGATCTAG/CATCAAGC"),
(8, "Subpool 8: ATGAGGACGAATCT/CACCTAAAG"),
(9, "Subpool 9: GGTAGGCACG/TAAACTTAGAACC"),
(10, "Subpool 10: AGTCATGATTCAG/GTTGCAAGTCTAG"),
]
# Restriction enzyme supplier codes
rebaseSuppliers = {
"B":"Life Technologies",
"C":"Minotech",
"E":"Agilent",
"I":"SibEnzyme",
"J":"Nippon Gene",
"K":"Takara",
"M":"Roche",
"N":"NEB",
"O":"Toyobo",
"Q":"Molecular Biology Resources",
"R":"Promega",
"S":"Sigma",
"V":"Vivantis",
"X":"EURx",
"Y":"SinaClon BioScience"
}
# labels and descriptions of eff. scores
scoreDescs = {
"doench" : ("Doench '14", "Range: 0-100. Linear regression model trained on 880 guides transfected into human MOLM13/NB4/TF1 cells (three genes) and mouse cells (six genes). Delivery: lentivirus. The Fusi score can be considered an updated version this score, as their training data overlaps a lot. See <a target='_blank' href='http://www.nature.com/nbt/journal/v32/n12/full/nbt.3026.html'>Doench et al.</a>"),
"wuCrispr" : ("Wu-Crispr", "Range 0-100. Aka 'Wong score'. SVM model trained on previously published data. The aim is to identify only a subset of efficient guides, many guides will have a score of 0. Takes into account RNA structure. See <a target='_blank' href='https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0784-0'>Wong et al., Gen Biol 2015</a>"),
"ssc" : ("Xu", "Range ~ -2 - +2. Aka 'SSC score'. Linear regression model trained on data from >1000 genes in human KBM7/HL60 cells (Wang et al) and mouse (Koike-Yusa et al.). Delivery: lentivirus. Ranges mostly -2 to +2. See <a target='_blank' href='http://genome.cshlp.org/content/early/2015/06/10/gr.191452.115'>Xu et al.</a>"),
"crisprScan" : ["Moreno-Mateos", "Also called 'CrisprScan'. Range: mostly 0-100. Linear regression model, trained on data from 1000 guides on >100 genes, from zebrafish 1-cell stage embryos injected with mRNA. See <a target=_blank href='http://www.nature.com/nmeth/journal/v12/n10/full/nmeth.3543.html'>Moreno-Mateos et al.</a>. Recommended for guides transcribed <i>in-vitro</i> (T7 promoter). Click to sort by this score. Note that under 'Show all scores', you can find a Doench2016 model trained on Zebrafish scores, Azimuth in-vitro, which should be slightly better than this model for zebrafish."],
"wang" : ("Wang", "Range: 0-100. SVM model trained on human cell culture data on guides from >1000 genes. The Xu score can be considered an updated version of this score, as the training data overlaps a lot. Delivery: lentivirus. See <a target='_blank' href='http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3972032/'>Wang et al.</a>"),
"chariRank" : ("Chari", "Range: 0-100. Support Vector Machine, converted to rank-percent, trained on data from 1235 guides targeting sequences that were also transfected with a lentivirus into human 293T cells. See <a target='_blank' href='http://www.nature.com/nmeth/journal/v12/n9/abs/nmeth.3473.html'>Chari et al.</a>"),
"fusi" : ("Doench '16", "Aka the 'Fusi-Score', since V4.4 using the version 'Azimuth', scores are slightly different than before April 2018 but very similar (click 'show all' to see the old scores). Range: 0-100. Boosted Regression Tree model, trained on data produced by Doench et al (881 guides, MOLM13/NB4/TF1 cells + unpublished additional data). Delivery: lentivirus. See <a target='_blank' href='http://biorxiv.org/content/early/2015/06/26/021568'>Fusi et al. 2015</a> and <a target='_blank' href='http://www.nature.com/nbt/journal/v34/n2/full/nbt.3437.html'>Doench et al. 2016</a> and <a target=_blank href='https://crispr.ml/'>crispr.ml</a>. Recommended for guides expressed in cells (U6 promoter). Click to sort the table by this score."),
"fusiOld" : ("OldDoench '16", "The original implementation of the Doench 2016 score, as received from John Doench. The scores are similar, but not exactly identical to the 'Azimuth' version of the Doench 2016 model that is currently the default on this site, since Apr 2018."),
"rs3" : ("Doench-RuleSet3", "The Doench Rule Set 3 (RS3) score (-200-+200). Similar to the Doench 2014 and Doench 2016/Fusi/Azimuth score, but updated and more accurate. See <a href='https://www.nature.com/articles/s41467-022-33024-2' target=_blank>. Scores shown are multiplied with 100 for easier display. RS3 is configured here to use the Hsu-TRACR sequence."),
"najm" : ("Najm 2018", "A modified version of the Doench 2016 score ('Azimuth'), by Mudra Hegde for S. aureus Cas9. Range 0-100. See <a target=_blank href='https://www.nature.com/articles/nbt.4048'>Najm et al 2018</a>."),
"ccTop" : ("CCTop", "The efficiency score used by CCTop, called 'crisprRank'."),
"aziInVitro" : ("Azimuth in-vitro", "The Doench 2016 model trained on the Moreno-Mateos zebrafish data. Unpublished model, gratefully provided by J. Listgarden. This should be better than Moreno-Mateos, but we have not found the time to evaluate it yet."),
"housden" : ("Housden", "Range: ~ 1-10. Weight matrix model trained on data from Drosophila mRNA injections. See <a target='_blank' href='http://stke.sciencemag.org/content/8/393/rs9.long'>Housden et al.</a>"),
"proxGc" : ("ProxGCCount", "Number of GCs in the last 4pb before the PAM"),
"seqDeepCpf1" : ("DeepCpf1", "Range: ~ 0-100. Convolutional Neural Network trained on ~20k Cpf1 lentiviral guide results. This is the score without DNAse information, 'Seq-DeepCpf1' in the paper. See <a target='_blank' href='https://www.nature.com/articles/nbt.4061'>Kim et al. 2018</a>"),
"oof" : ("Out-of-Frame", "Range: 0-100. Out-of-Frame score, only for deletions. Predicts the percentage of clones that will carry out-of-frame deletions, based on the micro-homology in the sequence flanking the target site. See <a target='_blank' href='http://www.nature.com/nmeth/journal/v11/n7/full/nmeth.3015.html'>Bae et al. 2014</a>. Click the score to show the predicted deletions."),
"lindel": ("Lindel", "Wei Chen Frameshift ratio (0-100). Predicts probability of a frameshift caused by any type of insertion or deletion. See <a href='https://academic.oup.com/nar/article/47/15/7989/5511473'>Wei Chen et al, Bioinf 2018</a>. Click the score to see the most likely deletions and insertions."),
}
# the headers for the guide and offtarget output files
guideHeaders = ["guideId", "targetSeq", "mitSpecScore", "cfdSpecScore", "offtargetCount", "targetGenomeGeneLocus"]
offtargetHeaders = ["guideId", "guideSeq", "offtargetSeq", "mismatchPos", "mismatchCount", "mitOfftargetScore", "cfdOfftargetScore", "chrom", "start", "end", "strand", "locusDesc"]
# library descriptions
libLabels = [
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4486245/
("human_brunello" , "Human, Brunello, Doench Nat Bio 2016 (recommended)"),
("human_avana" , "Human, Avana, Doench Nat Bio 2016"),
("human_geckov2" , "Human, GeCKO V2, Sanjana Nat Meth 2014"),
("mouse_brie" , "Mouse, Brie, Doench Nat Bio 2016 (recommended)"),
("mouse_geckov2" , "Mouse, GeCKO V2, Sanjana Nat Meth 2014"),
("mouse_asiago" , "Mouse, Asiago, Doench Nat Bio 2016"),
]
# a file crispor.conf in the directory of the script allows to override any global variable
myDir = dirname(__file__)
confPath =join(myDir, "crispor.conf")
if isfile(confPath):
exec(open(confPath).read())
#execfile(confPath)
cgiParams = None
# ====== END GLOBALS ============
def setupPamInfo(pam):
" modify a few globals based on the current pam "
global GUIDELEN
global pamIsFirst
global addGenePlasmids
global PAMLEN
global scoreNames
global baseEditor
global saCas9Mode
global mutScoreNames
global isSpg
pamOpt = None
if "-" in pam:
pam, pamOpt = pam.split("-")
if pamOpt=="BE1":
baseEditor = "BE1"
elif pamOpt=="spg":
isSpg = True
PAMLEN = len(pam)
pamIsFirst = False
scoreNames = cas9ScoreNames
if pamIsCasX(pam):
logging.debug("switching on CasX mode, guide length is 20bp")
GUIDELEN = 20
pamIsFirst = True
scoreNames = cpf1ScoreNames
if pamIsCpf1(pam):
logging.debug("switching on Cpf1 mode, guide length is 23bp")
GUIDELEN = 23
pamIsFirst = True
scoreNames = cpf1ScoreNames
if pamOpt:
GUIDELEN=int(pamOpt)
elif pam=="NGTN":
logging.debug("switching on Cpf1 mode for ShCAST, guide length is 23bp")
GUIDELEN = 23
pamIsFirst = True
elif pam=="NNNNRYAC" or pam=="NNNVRYAC":
GUIDELEN = 22
elif pam=="NNGRRT" or pam=="NNNRRT":
logging.debug("switching on S. aureus mode, guide length is 21bp")
addGenePlasmids = addGenePlasmidsAureus
GUIDELEN = 21
if pamOpt=="20":
GUIDELEN=20
saCas9Mode = True
scoreNames = saCas9ScoreNames
elif pam=="NNNNCC":
GUIDELEN = 24
else:
GUIDELEN = 20
if GUIDELEN==20 and pam=="NGG":
mutScoreNames = spCas9MutScoreNames
else:
mutScoreNames = otherMutScoreNames
logging.debug("Enzyme info: pam=%s, guideLen=%d, pamIsFirst=%s, saCas9Mode=%s" %
(pam, GUIDELEN, pamIsFirst, saCas9Mode))
return pam
# ==== CLASSES =====
class JobQueue:
"""
simple job queue, using a db table as a backend
jobs have different types and status. status can be updated while they run
job running times are kept and old job info is kept in a separate table
>>> q = JobQueue()
>>> q.openSqlite()
>>> q.clearJobs()
>>> q.waitCount()
0
>>> q.addJob("search", "abc123", "myParams")
True
only one job per jobId
>>> q.addJob("search", "abc123", "myParams")
False
>>> q.waitCount()
1
>>> q.getStatus("abc123")
'Waiting'
>>> q.startStep("abc123", "bwa", "Alignment with BWA")
>>> q.getStatus("abc123")
'Alignment with BWA'
>>> jobType, jobId, paramStr = q.popJob()
>>> q.waitCount()
0
>>> q.jobDone("abc123")
>>> q.waitCount()
0
can't pop from an empty queue
#>>> q.popJob()
#(None, None, None)
#>>> os.system("rm /tmp/tempCrisporTest.db")
#0
"""
_queueDef = (
'CREATE TABLE IF NOT EXISTS %s '
'('
' jobType text,' # either "index" or "search"
' jobId text %s,' # unique identifier
' paramStr text,' # parameters for jobs, like db, options, etc.
' isRunning int DEFAULT 0,' # indicates steps have started, done jobs are moved to doneJobs table
' stepName text,' # currently step, internal step name for timings
' stepLabel text,' # current step, human-readable status of job, for UI
' lastUpdate float,' # time of last update
' stepTimes text,' # comma-sep list of whole msecs, one per step
' startTime text ' # date+time when job was put into queue
')')
#def __init__(self):
#" no inheritance needed here "
#self.openSqlite(JOBQUEUEDB)
def openSqlite(self, dbName=JOBQUEUEDB):
self.dbName = dbName
self.conn = sqlite3.connect(dbName, timeout=10)
# trying to increase stability and locks?
if sqlite3.version_info[0] >= 3:
self.conn.execute("PRAGMA journal_mode=WAL;")
#self.conn.set_trace_callback(print) # for debugging: print all sql statements
try:
self.conn.execute(self._queueDef % ("queue", "PRIMARY KEY"))
except sqlite3.OperationalError:
errAbort("cannot open the file %s" % JOBQUEUEDB)
self.conn.commit()
self._chmodJobDb()
def _chmodJobDb(self):
# umask is not respected by sqlite, bug http://www.mail-archive.com/[email protected]/msg59080.html
try:
os.chmod(JOBQUEUEDB, 0o666)
except OSError:
# if the file was created by other job, we can't chmod, as we're the CGI. Just silently ignore this
pass
def addJob(self, jobType, jobId, paramStr):
" create a new job, returns False if not successful "
self._chmodJobDb()
sql = 'INSERT INTO queue (jobType, jobId, isRunning, lastUpdate, ' \
'stepTimes, paramStr, stepName, stepLabel, startTime) VALUES (:jobType, :jobId, :isRunning, :lastUpdate, ' \
':stepTimes, :paramStr, :stepName, :stepLabel, :startTime)'
now = "%.3f" % time.time()
values = {'jobType' : jobType, 'jobId' : jobId, 'isRunning' : 0, 'lastUpdate' : now, 'stepTimes':"", 'paramStr':paramStr, 'stepName':"wait",
"stepLabel":"Waiting", "startTime":now}
try:
cur = self.conn.cursor()
cur.execute(sql, values)
self.commitRetry()
return True
except sqlite3.IntegrityError as ev:
# if the job is already in the queue, on Python3 (not Python2!) an integrity error will be thrown.
# so this actually means that all is fine.
# instead of checking if the job exists and then add it, we just add it and tolerate the error, saves one query.
# (if the pipeline crashed or server was restarted, jobs may be on disk but not in the queue)
return True
except sqlite3.OperationalError:
errAbort("Cannot open DB file %s. Please contact %s" % (self.dbName, contactEmail))
def getStatus(self, jobId):
" return current job status label or None if job is not in queue"
sql = 'SELECT stepLabel FROM queue WHERE jobId=?'
try:
status = self.conn.execute(sql, (jobId,)).fetchmany(1)[0][0]
except StopIteration:
logging.debug("getStatus got StopIteration")
status = None
self.commitRetry()
return status
def dump(self):
" for debugging, write the whole queue table to stdout "
sql = 'SELECT * FROM queue'
for row in self.conn.execute(sql):
print("\t".join([str(x) for x in row]))
def jobInfo(self, jobId, isDone=False):
" for debugging, return all job info as a tuple "
print("job info<br>")
if isDone:
sql = 'SELECT * FROM doneJobs WHERE jobId=?'
else:
sql = 'SELECT * FROM queue WHERE jobId=?'
try:
row = next(self.conn.execute(sql, (jobId,)))
except StopIteration:
return []
return row
def commitRetry(self):
" try to commit 10 times "
tryCount = 0
while tryCount < 10:
try:
self.conn.commit()
break
except sqlite3.OperationalError:
time.sleep(3)
tryCount += 1
if tryCount >= 30:
raise Exception("Database locked for a long time")
def execute(self, cmd, *args):
" try to execute command 10 times "
tryCount = 0
while tryCount < 10:
try:
res = self.conn.execute(cmd, *args)
return res
except sqlite3.OperationalError:
time.sleep(3)
tryCount += 1
if tryCount >= 10:
raise Exception("Cannot execute %s, Database locked for a long time" % cmd)
def startStep(self, jobId, newName, newLabel):
" start a new step. Update lastUpdate, status and stepTime "
self.startTransaction()
sql = 'SELECT lastUpdate, stepTimes, stepName FROM queue WHERE jobId=?'
logging.debug(sql)
result = self.conn.execute(sql, (jobId,)).fetchmany(1)[0]
logging.debug(result)
lastTime, timeStr, lastStep = result
logging.debug("end")
lastTime = float(lastTime)
# append a string in format "stepName:milliSecs" to the timeStr
now = time.time()
timeDiff = "%d" % int((1000.0*(now - lastTime)))
newTimeStr = timeStr+"%s=%s" % (lastStep, timeDiff)+","
sql = 'UPDATE queue SET lastUpdate=?, stepName=?, stepLabel=?, stepTimes=?, isRunning=? WHERE jobId=?'
self.conn.execute(sql, (now, newName, newLabel, newTimeStr, 1, jobId))
self.commitRetry()
def startTransaction(self):
logging.debug("Starting transaction")
tryCount = 0
while tryCount < 10:
try:
self.conn.execute('BEGIN') # indicate that transaction should start now
break
except sqlite3.OperationalError:
logging.debug("Waiting since transaction start failed")
time.sleep(3)
tryCount += 1
if tryCount >= 10:
raise Exception("Database locked for a long time")
def jobDone(self, jobId):
" remove the job from the queue and add it to the queue log"
print("job done<br>")
self.startTransaction()
sql = 'SELECT * FROM queue WHERE jobId=?'
try:
row = next(self.conn.execute(sql, (jobId,)))
except StopIteration:
# return if the job has already been removed
logging.warn("jobDone - jobs %s has been removed already" % jobId)
self.commitRetry() # release lock
return
sql = 'DELETE FROM queue WHERE jobId=?'
self.conn.execute(sql, (jobId,))
self.commitRetry() # release lock
# good to have a log file of the old jobs
with open("doneJobs.tsv", "a") as ofh: # if this triggers an error: run 'touch doneJobs.tsv && chmod a+rw doneJobs.tsv' in the crispor dir.
row = [str(x) for x in row]
line = "\t".join(row)
ofh.write(line)
ofh.write("\n")
def waitCount(self):
" return number of waiting jobs, wait until database is ready "
sql = 'SELECT count(*) FROM queue WHERE isRunning=0'
count = None
while count is None:
try:
count = self.conn.execute(sql).fetchall()[0][0]
except sqlite3.OperationalError:
logging.debug("OperationalError on waitCount()")
time.sleep(1+random.random()/10)
return count
def popJob(self):
" return (jobType, jobId, params) of first waiting job and set it to running state "
print('pop job<br>')
self.startTransaction()
sql = 'SELECT jobType, jobId, paramStr FROM queue WHERE isRunning=0 ORDER BY lastUpdate LIMIT 1'
try:
jobType, jobId, paramStr = next(self.execute(sql))
except StopIteration:
logging.debug("No data for '%s'" % sql)
self.commitRetry() # unlock db
return None, None, None
sql = 'UPDATE queue SET isRunning=1 where jobId=?'
self.execute(sql, (jobId,))
self.commitRetry() # unlock db
return jobType, jobId, paramStr
def clearJobs(self):
" clear the job table, removing running jobs, too "
self.conn.execute("DELETE from queue")
self.commitRetry()
def close(self):
" "
self.conn.close()
# ====== FUNCTIONS =====
contentLineDone = False
# the queue workers should be able to never abort
doAbort = True
def getTwoBitFname(db):
" return the name of the twoBit file for a genome "
# at UCSC, try to use local disk, if possible
locPath = join("/scratch", "data", db, db+".2bit")
if isfile(locPath):
return locPath
path = join(genomesDir, db, db+".2bit")
return path
def errAbort(msg, isWarn=False):
" print err msg and exit "
if commandLineMode:
raise Exception(msg)
if not contentLineDone:
print("Content-type: text/html\n")
print('<div style="position: absolute; padding: 10px; left: 100; top: 100; border: 10px solid black; background-color: white; text-align:left; width: 800px; font-size: 18px">')
if isWarn:
print("<strong>Warning:</strong><p> ")
else:
print("<strong>Error:</strong><p> ")
print((msg+"<p>"))
print(("If you think this is a bug or you have any other suggestions, please do not hesitate to contact us %s<p>" % contactEmail))
if isWarn:
print("In the email, please also send us the full URL of the page.")
else:
print("Please also send us the full URL of the page where you see the error. Thanks!")
print('</div>')
if doAbort:
sys.exit(0) # cgi must not exit with 1
# allow only dashes, digits, characters, underscores and colons in the CGI parameters
# and +
notOkChars = re.compile(r'[^+a-zA-Z0-9/:\n\r_. -]')
def checkVal(key, inStr):
""" remove special characters from input string, to protect against injection attacks """
if key!="geneIds":
if len(inStr) > 10000:
errAbort("input parameter %s is too long" % key)
else:
if len(inStr) > 100000:
errAbort("Pasting more than tens of thousands of gene IDs makes little sense. Copy/paste error?")
matchObj =notOkChars.search(inStr)
if matchObj!=None:
errAbort("input parameter %s contains an invalid character %s (ASCII %d)" % (key, repr(matchObj.group()), ord(matchObj.group())))
return inStr
def cgiGetParams():
" get CGI parameters and return as dict "
form = cgi.FieldStorage()
global cgiParams
cgiParams = {}
# parameters are:
#"pamId", "batchId", "pam", "seq", "org", "download", "sortBy", "format", "ajax
for key in list(form.keys()):
val = form.getfirst(key)
if val!=None:
# "seq" is cleaned by cleanSeq later
val = urllib.parse.unquote(val)
if key not in ["seq", "name"]:
checkVal(key, val)
cgiParams[key] = val
if "pam" in cgiParams:
legalChars = set("ACTGNMKRYVBE120345/-")
illegalChars = set(cgiParams["pam"])-legalChars
if len(illegalChars)!=0:
errAbort("Illegal character in PAM-sequence. Only %s are allowed."+"".join(legalChars))
if "batchId" in cgiParams:
batchId = cgiParams["batchId"]
if not batchId.isalnum() or len(batchId) > 30:
errAbort("Invalid batchId")
return cgiParams
def cgiGetStr(params, argName, default=None):
val = params.get(argName, None)
if val==None and default==None:
errAbort("'%s' parameter must be specified" % argName)
if val==None:
return default
return val
def cgiGetNum(params, argName, default):
" get CGI parameter which must be a number "
val = params.get(argName, None)
if val==None:
return default
if not val.isdigit():
errAbort("'%s' parameter must be a number" % argName)
val = int(val)
return val
transTab = str.maketrans("-=/+_", "abcde")
def makeTempBase(seq, org, pam, batchName):
"create the base name of temp files using a hash function and some prettyfication "
hasher = hashlib.sha1(seq.encode("latin1")+org.encode("latin1")+pam.encode("latin1")+batchName.encode("latin1"))
shortHash = hasher.digest()[0:20]
batchId = base64.urlsafe_b64encode(shortHash).decode('latin1').translate(transTab)[:20]
return batchId
def makeTempFile(prefix, suffix):
" return a temporary file that is deleted upon exit, unless DEBUG is set "
if DEBUG:
fname = join("/tmp", prefix+suffix)
fh = open(fname, "wt")
else:
fh = tempfile.NamedTemporaryFile(mode="wt", dir=TEMPDIR, prefix="primer3In", suffix=".txt")
return fh
def pamIsCpf1(pam):
" if you change this, also change bin/filterFaToBed and bin/samToBed!!! "
return (pam in ["TTN", "TTTN", "TYCV", "TATV", "TTTV", "TTTR", "ATTN", "TTTA", "TCTA", "TCCA", "CCCA", "YTTV", "TTYN"])
def pamIsCasX(pam):
" if you change this, also change bin/filterFaToBed and bin/samToBed!!! "
return (pam in ["TTCN"])
def pamIsSaCas9(pam):
" only used for notes and efficiency scores, unlike its Cpf1 cousin function "
return (pam.split("-")[0] in ["NNGRRT", "NNNRRT"])
def isSlowPam(pam):
" do not allow input sequences > 500 bp "
if pamIsXCas9(pam) or pam=="TTYN" or pam=="NNG":
return True
else:
return False
def pamIsXCas9(pam):
" "
return (pam in ["NGK", "NGN"])
def pamIsSpCas9(pam):
" only used for notes and efficiency scores, unlike its Cpf1 cousin function "
return (pam in ["NGG", "NGA", "NGCG"])
def saveSeqOrgPamToCookies(seq, org, pam):
" create a cookie with seq, org and pam and print it"
cookies=http.cookies.SimpleCookie()
expires = 365 * 24 * 60 * 60
if len(seq)<3000:
cookies['lastseq'] = seq
else:
cookies['lastseq'] = "(last sequence was too long, could not be saved in Internet Browser cookie)"
cookies['lastseq']['expires'] = expires
cookies['lastorg'] = org
cookies['lastorg']['expires'] = expires
cookies['lastpam'] = pam
cookies['lastpam']['expires'] = expires
print(cookies)
def debug(msg):
if commandLineMode:
logging.debug(msg)
elif DEBUG:
print(msg)
print("<br>")
def gcContent(seq):
" return GC content as a float "
c = 0
for x in seq:
if x in ["G","C"]:
c+= 1
return (float(c)/len(seq))
def findPat(seq, pat):
""" yield positions where pat matches seq, stupid brute force search
"""
seq = seq.upper()
pat = pat.upper()
patLen = len(pat)
for i in range(0, len(seq)-patLen+1):
subseq = seq[i:i+patLen]
if patMatch(subseq, pat):
yield i
def rndSeq(seqLen):
" return random seq "
seq = []
alf = "ACTG"
for i in range(0, seqLen):
seq.append(alf[random.randint(0,3)])
return "".join(seq)
def cleanSeq(seq, db):
""" remove fasta header, check seq for illegal chars and return (filtered
seq, user message) special value "random" returns a random sequence.
"""
#print repr(seq)
if seq.startswith("random"):
seq = rndSeq(800)
lines = seq.strip().splitlines()
#print "<br>"
#print "before fasta cleaning", "|".join(lines)
if len(lines)>0 and lines[0].startswith(">"):
line1 = lines.pop(0)
#print "<br>"
#print "after fasta cleaning", "|".join(lines)
#print "<br>"
newSeq = []
nCount = 0
for l in lines:
if len(l)==0: