#!/usr/bin/env python
# first load the standard libraries from python
# we require at least python 2.7
#from sys import *
from __future__ import print_function
import sys
if sys.version_info[0]==2 and not sys.version_info[1]>=7:
print("Sorry, this program requires at least python 2.7")
print("You can download a more current python version from python.org and compile it")
print("into your homedir with 'configure --prefix ~/python'; make;")
print("then run this program by specifying your own python executable like this: ")
print(" ~/python/bin/python ~/pubtools/pubtools")
print("or add python/bin to your PATH before /usr/bin, then run pubtools itself")
exit(1)
# load default python packages
import logging, optparse, os, collections, tarfile, mimetypes
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubGeneric, pubStore, pubConf, maxCommon, pubPubmed
from pubXml import *
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <in> <out> - convert a local directory with text files or PDFs (named <PMID>.pdf or <PMID>.txt) and supplementary files (named <PMID>.supp<count>.pdf or <PMID>.supp<count>.txt) to pubtools format. Article meta information is retrieved from NCBI EUtils or a local Medline copy.
Make sure that you set the minId parameter.
(ArticleIds should not overlap between different datasets.)
Example:
pubConvFiles /hive/data/outside/pubs/spliceAid/ /hive/data/inside/literature/text/spliceAid2/ --minId=5000000000
""")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
parser.add_option("", "--minId", dest="minId", action="store", type="int", help="numerical IDs written to the pubStore start at this number prevent overlaps of numerical IDs, default %default", default=pubConf.identifierStart["pdfDir"])
parser.add_option("-p", "--parse", dest="parseFile", action="store", help="only parse a single file (for debugging)")
parser.add_option("", "--notPmid", dest="notPmid", action="store_true", help="filenames are not <PMID>.pdf but some other identifier. Will result in 0_articles.gz files without any meta information")
parser.add_option("", "--localMedline", dest="localMedline", action="store_true", help="Can get PMID article info either from NCBI Eutils or a local copy of Medline. This option activates local lookups with a fallback to remote queries.")
#parser.add_option("-e", "--encoding", dest="encoding", action="store", help="encoding to convert to, by default input is assumed to be in UTF8 and is also encoded as such")
(options, args) = parser.parse_args()
# ==== FUNCTIONS =====
def createIndex(inDir, outDir, minId):
    " get all PMIDs from dir and create index file in outDir "
    files = os.listdir(inDir)
    logging.info("Reading input dir %s" % inDir)

    # create dict pmid -> set of filenames
    idFiles = {}
    for fname in files:
        fileId = basename(fname).split(".")[0]
        idFiles.setdefault(fileId, set()).add(fname)
    logging.info("Found %d files with %d article identifiers" % (len(files), len(idFiles)))

    indexFname = join(outDir, "index.tab")
    indexFile = open(indexFname, "w")
    logging.info("Writing index file %s" % indexFname)

    # write index file
    headers = ["chunkId", "articleId", "externalId", "mainFile", "suppFiles"]
    indexFile.write("\t".join(headers)+"\n")
    articleId = minId
    for extId, files in idFiles.items():
        logging.debug("processing file with extId %s" % extId)
        if not extId.isdigit():
            continue
        chunkId = "0_00000"
        mainFile = extId
        if mainFile+".pdf" in files:
            files.remove(mainFile+".pdf")
        if mainFile+".txt" in files:
            files.remove(mainFile+".txt")
        row = [chunkId, str(articleId), extId, mainFile, ",".join(files)]
        indexFile.write("\t".join(row)+"\n")
        articleId += 1
    indexFile.close()
    return indexFname
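# For the hypothetical input layout shown above and --minId=5000000000, index.tab
# would contain tab-separated rows roughly like this (supplementary file order may vary):
#   chunkId   articleId   externalId  mainFile  suppFiles
#   0_00000   5000000000  12345678    12345678  12345678.supp1.pdf,12345678.supp2.txt
#   0_00000   5000000001  99999999    99999999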
def createFileData(baseDir, fname, extId, isSupp):
    " create fileData dict from fname "
    logging.debug("Creating file data for %s" % repr(fname))
    fileData = pubStore.createEmptyFileDict()
    if isSupp:
        fileData["desc"] = "supplement (%s)" % basename(fname).split(".")[1]
        fileType = "supp"
    else:
        fileData["desc"] = "main text (pdf)"
        fileType = "main"
    fileData["fileType"] = fileType
    fileData["url"] = fname

    mainPath = join(baseDir, fname)
    if not isfile(mainPath):
        mainPath = join(baseDir, fname+".txt")
    if not isfile(mainPath):
        mainPath = join(baseDir, fname+".pdf")
    if not isfile(mainPath):
        maxCommon.errAbort("%s not found" % mainPath)
    assert(isfile(mainPath))

    fileData["mimeType"] = mimetypes.guess_type(mainPath)[0]
    fileData["content"] = open(mainPath).read()
if fileData["mimeType"]!="text/plain":
fileData = pubGeneric.toAsciiEscape(fileData)
return fileData
def createArticleData(externalId, notPmid, localMedline):
    " create article data dict "
    if notPmid:
        articleData = pubStore.createEmptyArticleDict()
        articleData["externalId"] = externalId+".pdf"
    else:
        articleData = pubStore.lookupArticleByPmid(["medline"], externalId, localMedline)
        if articleData==None:
            logging.debug("No local information for %s" % externalId)
            ncbiList = list(pubPubmed.ncbiEFetchGenerator([externalId]))
            if len(ncbiList)==0:
                return None
            articleData = ncbiList[0]

    #articleData["externalId"] = "PMID"+externalId
    #articleData["fulltextUrl"] = "www.ncbi.nlm.nih.gov/pubmed/%s" % externalId
    articleData["source"] = "pdfDir"
    return articleData
def convertFiles(inDir, outDir, minId, parseFname, notPmid, localMedline):
    " index files and convert to pubtools file in outDir, first articleId is minId "
    indexFn = createIndex(inDir, outDir, minId)
    chunkId = "0_00000"
    writer = pubStore.PubWriterFile(join(outDir, chunkId))
    failed = []
    for row in maxCommon.iterTsvRows(indexFn):
        # article data
        externalId = row.mainFile
        articleId = row.articleId
        mainFile = row.mainFile
        if parseFname!=None and mainFile != parseFname:
            continue
        suppFiles = row.suppFiles.split(",")
        logging.info("Converting article data for %s, articleId %s, suppFiles %s" % (mainFile, articleId, ",".join(suppFiles)))
        articleData = createArticleData(externalId, notPmid, localMedline)
        if articleData==None:
            failed.append("metaData:"+externalId)
            continue
        # we have no fields for these in our current article schema
        del articleData["mid"]
        del articleData["pii"]

        # file data of main file
        mainFileData = createFileData(inDir, mainFile, externalId, False)
        fileCount = 0
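        # the fileId embeds the articleId: the articleId is shifted left by
        # pubConf.FILEDIGITS decimal digits and a per-article file counter is added,
        # e.g. with FILEDIGITS=3 (the actual value comes from pubConf), articleId
        # 5000000000 yields 5000000000000 for the main file and 5000000000001 for
        # the first supplementary file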
        fileId = ((10**pubConf.FILEDIGITS)*int(articleId))+fileCount
        if mainFileData==None:
            failed.append(mainFile)
            logging.info("Conversion not successful")
            continue
        writer.writeFile(articleId, fileId, mainFileData, externalId=articleData["externalId"])
        writer.writeArticle(articleId, articleData)

        # file data of supp files
        for suppFile in suppFiles:
            if suppFile=="":
                continue
            fileCount += 1
            fileId = (10**pubConf.FILEDIGITS*int(articleId))+fileCount
            fileData = createFileData(inDir, suppFile, externalId, True)
            if fileData != None:
                writer.writeFile(articleId, fileId, fileData)
            else:
                failed.append(suppFile)

    writer.close()
    if len(failed)!=0:
        logging.info("These PDFs could not be converted: %s" % ",".join(failed))
# ----------- MAIN --------------
if args==[]:
    parser.print_help()
    exit(1)
inDir, outDir = args
maxCommon.mustExist(inDir)
minId = options.minId
pubGeneric.setupLogging(progFile, options)
maxCommon.mustExistDir(outDir)
maxCommon.mustBeEmptyDir(outDir)
convertFiles(inDir, outDir, minId, options.parseFile, options.notPmid, options.localMedline)