-
Notifications
You must be signed in to change notification settings - Fork 21
/
pubMap
executable file
·177 lines (139 loc) · 7.15 KB
/
pubMap
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python2
# first load the standard libraries from python
# we require at least python 2.7
from __future__ import print_function
import sys

# Refuse to run on a Python 2 interpreter older than 2.7 and tell the user how
# to get a newer one. Python 3 interpreters pass this check.
if sys.version_info[0] == 2 and sys.version_info[1] < 7:
    print("Sorry, this program requires at least python 2.7")
    print("You can download a more current python version from python.org and compile it")
    print("into your homedir (or anywhere else) with 'configure --prefix ~/python27'; make;")
    print("then run this program again by specifying your own python executable like this: ")
    print(" ~/python27/bin/python <%s>" % sys.argv[0])
    print("or add ~/python27/bin to your PATH before /usr/bin")
    # sys.exit() rather than the bare exit(): the latter is a convenience name
    # added by the site module and is not available in every interpreter setup.
    sys.exit(1)
# load default python packages
import logging, optparse, os, atexit, datetime
# the star import supplies the unqualified join() and isfile() used further down
from os.path import *
# add <scriptDir>/lib/ to package search path
# (<scriptDir> is the directory of sys.argv[0]; the lib directory is inserted
# at the front of sys.path, ahead of every other search location)
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubConf, pubGeneric, pubAlg, maxCommon, tabfile, pubMap, pubMapProp
# GLOBAL: filename of lockfile
# None until checkCreateLock() assigns the path; removeLock() reads it
lockFname = None
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# Build the optparse parser. The usage text passed here is the user-facing
# description of the two positional arguments (<publisher> <step>) and of every
# step name and command; %prog is expanded by optparse to the program name.
parser = optparse.OptionParser(
"""usage: %prog [options] <publisher> <step> - pipeline to map fulltext files to genome locations and
create tracks for genome browser
If target directory already exists, will only process updateIds which have not been processed yet
pubMap requires some external data:
- UCSC genome sequences as 2bit, found via pubConf.GBCONF which points to genbank.conf
- optional: genome loci with an assignment genomic range <-> entrez gene Id &
symbol, created by pubPrepCdnaDir
- optional: cdna sequences created by pubPrepCdnaDir
- optional: genome sequences not part of genbank.conf as 2bit, found via
pubConf.nonUcscGenomesDir
STEPS FOR SCANNING TEXT FOR SEQUENCES:
"annot": extract DNA/protein seqs and accessions from text files
(annotators: scripts/dnaFinder.py, scripts/protFinder.py, scripts/markerFinder.py)
example:
pubMap pmc annot
optional:
"annotMarkerDbg": runs only the accession finder, for testing
"annotSeqDbg": runs only the DNA/protein finders, for testing
STEPS FOR BLATTING ONTO GENOMES/CDNA/PROT:
(use pubPrepCdnaDir to build the directory with mRNA sequences)
"filter": remove duplicate and very short DNA/Prot sequences, convert to fasta files
one fasta file per genome
input: <baseDir>/annot, output: <baseDir>/blatGenome/seq and <baseDir>/blatProt/seq
example: pubMap pmc filter
"blat": submit blat jobs of DNA against genomes, cdna, proteins,
separates job into short/long sequences
output: <baseDir>/blat/{genome,cdna,prot}
example: pubMap pmc blat
"sort": sort all blat output files, map cdna/prot to genome, run pslCDnaFilter
example: pubMap pmc sort
"chain": merge cDNA, genome and protein DNA psl from all species into one file, split and
chain psls and convert to a raw version of bed (for debugging as custom track)
input: <baseDir>/{blatGenome,blatCdna}/sortedPsl/
output: <baseDir>/blatGenome/bed
example:
pubMap pmc chain
STEPS FOR LOADING RESULTS INTO GENOME BROWSER:
"identifiers": run a map/reduce job to retrieve
the names of files for all fileIds and write to
<outDir>/files.tab
"tables": create tables for hgLoadxxx
- reads in articleIds.lst from <baseDir>/markers
- reads impacts, article descriptions and article classification results
- reads sequence info from <outDir>/seqs
- uses textDir.conf to find the fulltext files and parse
out basic article information
- writes article and seq info to tab-files in <outDir>/tables/
example: pubMap pmc tables
"load": load all tables with hgLoadSql/hgLoadBed into browser
example:
pubMap pmc,elsevier load
Will process all of <outDir> tables and <outDir>/updates/<i>/tables
Note that you need to supply --loadFinal if you really want to
write to the real hgwdev tables.
This command will use the table hgFixed.pubsLoadedFiles to keep
track of already loaded files. It will not load files that have already
been loaded into mysql before.
OTHER COMMANDS:
"<type>": run a list of the steps described above.
<type> can be either "all" or a range, like "annot-load"
"all" is the same as "annot-tables"
dropAll: remove all tables (by default the pubsDev... tables)
switchOver: move all tables from the pubsDev to the pubs prefix
expFa: export all sequences as fasta files to pubConf.faDir
expCdr3: export all CDR3-like prot seqs as fasta and tab files to pubConf.cdr3Dir
""")
# Script-specific switches. Options without a short form pass "" as the first
# option string, exactly as before.
parser.add_option(
    "-s", "--skipConvert",
    action="store_true",
    dest="skipConvert",
    default=False,
    help="do not convert sequences or convert to fasta or sort/chain, only do the second part of the processing for blat, blatCdna or chain. Used for debugging.",
)
parser.add_option(
    "-o", "--outDir",
    action="store",
    dest="outDir",
    default=pubConf.pubMapBaseDir,
    help="overwrite the default output directory from pubConf, default %default",
)
parser.add_option(
    "-l", "--activateLog",
    action="store_true",
    dest="activateLog",
    default=False,
    help="write logging to pubMap.<date>.log",
)
parser.add_option(
    "", "--onlyDb",
    action="append",
    dest="onlyDb",
    help="only use certain genomes for blatting, can be specified several times",
)
parser.add_option(
    "", "--tablePrefix",
    action="store",
    dest="tablePrefix",
    default="Dev",
    help="by default, the 'load' step will load into tables with the pubsDev prefix. If you want to load into final tables, set this to ''. Default value %default. The prefix 'pubs' is always added to the prefix.",
)

# Let pubGeneric register its shared options on the same parser, then parse
# the command line into the module-level `options` and `args`.
pubGeneric.addGeneralOptions(parser)
options, args = parser.parse_args()
# ==== FUNCTIONs =====
def main(args, options):
    """Dispatch the requested command for one dataset.

    args must hold exactly two items: the dataset and the command.
    The command is interpreted as follows:
      - contains "-": split into "<from>-<to>" and run as a step range
      - "all": run the step range from "annot" to "tables"
      - anything else: run as a single step via pubMap.runStep

    If options.debug is set, pubMap.DEBUG is switched on first.
    """
    if options.debug:
        pubMap.DEBUG = True

    dataset, command = args

    # ordered list of step names that ranges are resolved against
    pipelineSteps = [
        "annot", "annotMarkerDbg", "annotSeqDbg", "filter", "blat", "sort",
        "chain", "identifiers", "tables", "load",
    ]

    if "-" in command:
        firstStep, lastStep = command.split("-")
        pubMap.runStepRange(dataset, pipelineSteps, firstStep, lastStep, args, options)
        return

    if command == "all":
        pubMap.runStepRange(dataset, pipelineSteps, "annot", "tables", args, options)
        return

    pubMap.runStep(command, dataset, options)
def removeLock():
    """Delete the lockfile whose path is stored in the global lockFname.

    Registered as an exit handler by checkCreateLock(). Does nothing if no
    lockfile path has been recorded. A lockfile that is already gone is not an
    error: the message raised by checkCreateLock() tells users to delete a
    stale lockfile by hand, and that must not cause a traceback at exit.

    Raises:
        OSError: if the file exists but cannot be removed.
    """
    import errno  # local import: errno is not among the file-level imports

    if lockFname is None:
        return
    logging.debug("Removing lockfile %s" % lockFname)
    try:
        os.remove(lockFname)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise


def checkCreateLock(outDir):
    """Create the lockfile <outDir>/_pubMap.lock and register its removal at exit.

    The path is stored in the global lockFname. The file is created with
    O_CREAT | O_EXCL, so the existence check and the creation are a single
    step, and the descriptor is closed right away instead of being left open.

    Raises:
        Exception: if the lockfile already exists.
        OSError: if the lockfile cannot be created for any other reason.
    """
    global lockFname
    import errno  # local import: errno is not among the file-level imports

    lockFname = join(outDir, "_pubMap.lock")
    try:
        lockFd = os.open(lockFname, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
        lockFd = None

    # raised outside the except clause so the message is not chained to the
    # low-level OSError
    if lockFd is None:
        raise Exception("File %s exists - it seems that pipeline is already running. "
                        "If you're sure that this is not the case, remove the lockfile and retry again" % lockFname)

    os.close(lockFd)
    logging.debug("Creating lockfile %s" % lockFname)
    atexit.register(removeLock)  # register handler that is executed on program exit
# ----------- MAIN --------------
# main() unpacks exactly two positional arguments (<publisher> <step>), so any
# other count gets the usage text instead of an unpacking traceback.
if len(args) != 2:
    parser.print_help()
    # sys.exit() rather than the bare exit(), which is only a convenience name
    # added by the site module
    sys.exit(1)

# setup logging
# The log file name carries the current date formatted as DDMMYY.
# NOTE(review): the file name is passed unconditionally; whether the
# --activateLog option gates file logging would have to be decided inside
# pubGeneric.setupLogging, which receives `options` - TODO confirm.
dateStr = datetime.datetime.now().strftime("%d%m%y")
logFileName = join(pubConf.logDir, 'pubMap.%s.log' % dateStr)
fileMode = "a"
pubGeneric.setupLogging(__file__, options, logFileName=logFileName, fileMode=fileMode)

main(args, options)