-
Notifications
You must be signed in to change notification settings - Fork 6
/
ticcl.nf
executable file
·613 lines (498 loc) · 21.6 KB
/
ticcl.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
*/
// Startup banner
log.info "--------------------------"
log.info "TICCL Pipeline"
log.info "--------------------------"

def env = System.getenv()

// Default parameter values; any of these can be overridden on the command line with --<name>

// Environment: reuse the Virtual Environment this run was started from, if any (empty string otherwise)
params.virtualenv = env.getOrDefault('VIRTUAL_ENV', "")

// Input/output selection
params.inputtype = "folia"
params.extension = "folia.xml"
params.inputclass = "current"
params.outputclass = "current"
params.outputdir = "ticcl_output"
params.language = "nld"

// Resource files (no usable default; checked further down)
params.lexicon = ""
params.alphabet = ""

// Tuning values handed to the TICCL tools
params.artifrq = 10000000
params.distance = 2
params.clip = 1
params.low = 5
params.high = 35
params.ngram = 1
params.chainclean = 0
//Output usage information if --help is specified, then stop (exit code 2)
//NOTE: the defaults quoted here must match the params.* assignments at the top of the script
if (params.containsKey('help')) {
    log.info "Usage:"
    log.info " ticcl.nf [OPTIONS]"
    log.info ""
    log.info "Mandatory parameters:"
    log.info " --inputdir DIRECTORY Input directory (FoLiA documents with an OCR text layer)"
    log.info " --lexicon FILE Path to lexicon file (*.dict)"
    log.info " --alphabet FILE Path to alphabet file (*.chars)"
    log.info " --charconfus FILE Path to character confusion list (*.confusion)"
    log.info ""
    log.info "Optional parameters:"
    log.info " --outputdir DIRECTORY Output directory (FoLiA documents)"
    log.info " --language LANGUAGE Language"
    log.info " --extension STR Extension of FoLiA documents in input directory (default: folia.xml, must always end in xml)!"
    log.info " --inputclass CLASS FoLiA text class to use for input, defaults to 'current' for FoLiA input; must be set to 'OCR' for FoLiA documents produced by ocr.nf"
    log.info " --outputclass CLASS FoLiA text class to use for output, defaults to 'current' for FoLiA output, but may not be equal to the class used for --inputclass"
    log.info " --inputtype STR Input type can be either 'folia' (default), 'text', or 'pdf' (i.e. pdf with text; no OCR)"
    log.info " --virtualenv PATH Path to Virtual Environment to load (usually path to LaMachine)"
    log.info " --artifrq INT Default value for missing frequencies in the validated lexicon (default: 10000000)"
    log.info " --distance INT Levenshtein/edit distance (default: 2)"
    //the actual default is params.clip = 1 (the help text used to claim 10)
    log.info " --clip INT Limit the number of variants per word (default: 1)"
    log.info " --corpusfreqlist FILE Corpus frequency list (skips the first step that would compute one for you)"
    //params.ngram is passed to FoLiA-stats as --max-ngram when the corpus frequency list is computed
    log.info " --ngram INT Maximum n-gram size when computing the corpus frequency list (default: 1)"
    log.info " --low INT skip entries from the anagram file shorter than 'low' characters. (default=5)"
    log.info " --high INT skip entries from the anagram file longer than 'high' characters. (default=35)"
    log.info " --chainclean BOOLINT enable chain clean or not (1 = on, 0 = off, default)"
    log.info " --nofoliacorrect skip the FoLiA correct step"
    log.info " --nostringlinking skip the final string linking step"
    exit 2
}
//Check mandatory parameters and produce sensible error messages.
//Every failed check terminates the run with exit code 2.
if (!params.containsKey('inputdir')) {
    log.info "Error: Missing --inputdir parameter, see --help for usage details"
    exit 2 //without this the pipeline would carry on with an undefined input directory
} else {
    def dircheck = new File(params.inputdir)
    if (!dircheck.exists()) {
        log.info "Error: Specified input directory does not exist"
        exit 2
    }
}
//lexicon and alphabet are pre-initialised to "" in the defaults, so the key is always present;
//test the value itself (empty string / null count as missing)
if (!params.lexicon) {
    log.info "Error: Missing --lexicon parameter, see --help for usage details"
    exit 2
}
if (!params.alphabet) {
    log.info "Error: Missing --alphabet parameter, see --help for usage details"
    exit 2
}
//charconfus has no default, so presence of the key is the right test here
if (!params.containsKey('charconfus')) {
    log.info "Error: Missing --charconfus parameter, see --help for usage details"
    exit 2
}
//Initialise channels from various input files specified in parameters, these will be consumed as input by a process later on
//NOTE(review): ifEmpty() given a plain value emits that value as a fallback item instead of aborting the run - confirm whether an error was intended here
lexicon = Channel.fromPath(params.lexicon).ifEmpty("Lexicon file not found")
alphabet = Channel.fromPath(params.alphabet).ifEmpty("Alphabet file not found")
charconfuslist = Channel.fromPath(params.charconfus).ifEmpty("Character confusion file not found")

inputclass = "OCR" //default internal inputclass (will be overridden with the user-supplied class, default 'current', in case of FoLiA input)

//Set up the document channel(s) according to --inputtype. Every branch ends up (directly or through
//the conversion processes) feeding the folia_ocr_documents channel.
if (params.inputtype == "folia") {
    //Create two identical channels (folia_ocr_documents & input_overview) globbing all FoLiA documents in the input directory (recursively!)
    //the input_overview channel will be consumed immediately, simply printing all input filenames
    Channel.fromPath(params.inputdir+"/**." + params.extension).into { folia_ocr_documents; input_overview }
    input_overview.subscribe { println "TICCL FoLiA input: ${it.baseName}" }
    inputclass = params.inputclass //use user-supplied input class (default to 'current')
} else if (params.inputtype == "text") {
    //Create two identical channels globbing all text documents in the input directory (recursively!), skipping any file called trace.txt
    Channel.fromPath(params.inputdir+"/**.txt").filter { it.baseName != "trace" }.into { textdocuments; input_overview }
    input_overview.subscribe { println "TICCL text input: ${it.baseName}" }
} else if (params.inputtype == "pdf") {
    //Create two identical channels globbing all PDF documents in the input directory (recursively!)
    //(pdfdocuments is defined only here; a separate assignment before this statement would just be overwritten)
    Channel.fromPath(params.inputdir+"/**.pdf").into { pdfdocuments; input_overview }
    input_overview.subscribe { println "TICCL PDF input: ${it.baseName}" }
    inputclass = "OCR"

    process pdf2text {
        /*
            convert PDF to Text with pdftotext
        */

        input:
        file pdfdocument from pdfdocuments

        output:
        file "${pdfdocument.baseName}.txt" into textdocuments

        script:
        """
        #!/bin/bash
        pdftotext -nopgbrk -eol unix "$pdfdocument" "${pdfdocument.baseName}.txt"
        """
    }
} else {
    log.error "No such inputtype: " + params.inputtype
    exit 2
}
//Plain text needs wrapping in FoLiA before TICCL can work on it
//(PDF input has already been turned into text by the pdf2text process at this point)
if (params.inputtype in ["text", "pdf"]) {

    process txt2folia {
        /*
            Runs FoLiA-txt on one text document and emits the resulting <basename>.folia.xml
        */

        input:
        val virtualenv from params.virtualenv
        file textdocument from textdocuments

        output:
        file "${textdocument.baseName}.folia.xml" into folia_ocr_documents

        script:
        """
        #!/bin/bash
        #activate the virtualenv when one is configured (has to be repeated in every process so the LaMachine environment is available)
        set +u
        if [ -n "${virtualenv}" ]; then
            source ${virtualenv}/bin/activate
        fi
        set -u
        FoLiA-txt --class OCR -t 1 -O . "${textdocument}" || exit 1
        #fail explicitly when the tool left no (or an empty) result behind
        [ -s "${textdocument.baseName}.folia.xml" ] || {
            echo "ERROR: Expected output ${textdocument.baseName}.folia.xml does not exist or is empty">&2
            exit 6
        }
        """
    }

}
//fork the above output channel into two so it can be used as input by two processes (a channel is consumed upon input)
folia_ocr_documents.into { folia_ocr_documents_forcorpusfrequency; folia_ocr_documents_forfoliacorrect }

if (params.containsKey('corpusfreqlist')) {
    //a corpus frequency list is explicitly provided as parameter, set up a channel
    corpusfreqlist = Channel.fromPath(params.corpusfreqlist)
} else {
    //no corpus frequency list is provided, so we compute one with FoLiA-stats

    process corpusfrequency {
        /*
            Process corpus into frequency file for TICCL (with FoLiA-stats)

            Runs once over ALL input documents and emits a single corpus.wordfreqlist.tsv
        */

        publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (rather than deleting this intermediate output)

        input:
        //collect() gathers every document into one task; without it the process would run once per
        //document and downstream steps would only ever see a single-document frequency list.
        //The documents are staged under names built from the doc*.<extension> pattern.
        file "doc*." + params.extension from folia_ocr_documents_forcorpusfrequency.collect()
        val virtualenv from params.virtualenv
        val inputclass from inputclass
        val extension from params.extension
        val ngram from params.ngram

        output:
        file "corpus.wordfreqlist.tsv" into corpusfreqlist

        script:
        """
        #!/bin/bash
        #set up the virtualenv if necessary
        set +u
        if [ ! -z "${virtualenv}" ]; then
            source ${virtualenv}/bin/activate
        fi
        set -u
        FoLiA-stats --class "$inputclass" -s -t ${task.cpus} -e "$extension" --lang=none --collect --max-ngram ${ngram} --separator "_" -o corpus . || exit 1
        #normalise the output name to the fixed name declared as process output
        mv corpus.wordfreqlist.?to?.tsv corpus.wordfreqlist.tsv
        if [ ! -s "corpus.wordfreqlist.tsv" ]; then
            echo "ERROR: Expected output corpus.wordfreqlist.tsv does not exist or is empty">&2
            exit 6
        fi
        """
    }
}
//dedicated alphabet channel for ticclunk (each channel can only be consumed once)
alphabet_forunk = Channel.fromPath(params.alphabet).ifEmpty("Alphabet file not found")

process ticclunk {
    /*
        Runs TICCL-unk on the corpus frequency list, using the lexicon as background and the alphabet file.
        Emits three files named after the input list: .clean, .unk and .punct
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //keep this intermediate result visible to the end-user

    input:
    val virtualenv from params.virtualenv
    val artifrq from params.artifrq
    file corpusfreqlist from corpusfreqlist //corpus frequency list in FoLiA-stats format
    file lexicon from lexicon
    file alphabet from alphabet_forunk

    output:
    file "${corpusfreqlist}.clean" into corpusfreqlist_clean //cleaned wordfrequency file
    file "${corpusfreqlist}.unk" into unknownfreqlist //unknown words list
    file "${corpusfreqlist}.punct" into punctuationmap //list of words mapping strings with leading/trailing punctuation to clean variants

    script:
    """
    #!/bin/bash
    #activate the virtualenv when one is configured
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    TICCL-unk --background "${lexicon}" --artifrq ${artifrq} --alph "${alphabet}" "${corpusfreqlist}" || exit 1
    #only the .clean file is verified here
    [ -s "${corpusfreqlist}.clean" ] || {
        echo "ERROR: Expected output ${corpusfreqlist}.clean does not exist or is empty">&2
        exit 6
    }
    """
}
//the cleaned frequency list is needed by three processes, so split it three ways
corpusfreqlist_clean.into { corpusfreqlist_clean_foranahash; corpusfreqlist_clean_forresolver; corpusfreqlist_clean_forindexer }

process anahash {
    /*
        Hashes every item of the clean wordfrequency list with TICCL-anahash.
        Emits <list>.anahash and <list>.corpusfoci
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //keep this intermediate result visible to the end-user

    input:
    val virtualenv from params.virtualenv
    val artifrq from params.artifrq
    file corpusfreqlist from corpusfreqlist_clean_foranahash
    file alphabet from alphabet

    output:
    file "${corpusfreqlist}.anahash" into anahashlist
    file "${corpusfreqlist}.corpusfoci" into corpusfocilist

    script:
    """
    #!/bin/bash
    #activate the virtualenv when one is configured
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    TICCL-anahash --alph "${alphabet}" --artifrq ${artifrq} "${corpusfreqlist}" --ngrams || exit 1
    #both result files must exist and be non-empty
    [ -s "${corpusfreqlist}.anahash" ] || {
        echo "ERROR: Expected output ${corpusfreqlist}.anahash does not exist or is empty">&2
        exit 6
    }
    [ -s "${corpusfreqlist}.corpusfoci" ] || {
        echo "ERROR: Expected output ${corpusfreqlist}.corpusfoci does not exist or is empty">&2
        exit 6
    }
    """
}
//duplicate the channels that have more than one consumer
anahashlist.into { anahashlist_forindexer; anahashlist_forresolver }
charconfuslist.into { charconfuslist_forindexer; charconfuslist_forrank }

process indexer {
    /*
        Builds an index from the anagram hashes with TICCL-indexerNT.
        Emits <corpusfreqlist>.indexNT
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true
    label "multicore"

    input:
    val virtualenv from params.virtualenv
    val low from params.low
    val high from params.high
    file corpusfreqlist from corpusfreqlist_clean_forindexer //only used for naming purposes, not real input
    file anahashlist from anahashlist_forindexer
    file charconfuslist from charconfuslist_forindexer
    file corpusfocilist from corpusfocilist

    output:
    file "${corpusfreqlist}.indexNT" into index

    //NOTE: the -o option below is a prefix only, the extension indexNT will be appended !!
    script:
    """
    #!/bin/bash
    #activate the virtualenv when one is configured
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    TICCL-indexerNT --hash "${anahashlist}" --charconf "${charconfuslist}" --foci "${corpusfocilist}" -o "${corpusfreqlist}" -t ${task.cpus} --low ${low} --high ${high} || exit 1
    #a missing index (exit 6) is reported differently from an empty one (exit 22)
    if [ ! -e "${corpusfreqlist}.indexNT" ]; then
        echo "ERROR: Expected output ${corpusfreqlist}.indexNT does not exist.">&2
        exit 6
    fi
    if [ ! -s "${corpusfreqlist}.indexNT" ]; then
        echo "ERROR: Expected output ${corpusfreqlist}.indexNT is empty. This means that no correction candidates could be found for any of the words in the input and that the pipeline finishes prematurely because no further processing can be done.">&2
        exit 22
    fi
    """
}
//the resolver needs its own alphabet channel, the earlier one has been consumed already
alphabet_forresolver = Channel.fromPath(params.alphabet).ifEmpty("Alphabet file not found")

process resolver {
    /*
        Resolves numerical confusions back to word form confusions using TICCL-LDcalc.
        Emits <corpusfreqlist>.ldcalc
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //keep this intermediate result visible to the end-user
    label "multicore"

    input:
    val virtualenv from params.virtualenv
    val distance from params.distance
    val artifrq from params.artifrq
    val low from params.low
    val high from params.high
    file index from index
    file anahashlist from anahashlist_forresolver
    file corpusfreqlist from corpusfreqlist_clean_forresolver
    file alphabet from alphabet_forresolver

    output:
    file "${corpusfreqlist}.ldcalc" into wordconfusionlist

    script:
    """
    #!/bin/bash
    #activate the virtualenv when one is configured
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    TICCL-LDcalc --index "${index}" --hash "${anahashlist}" --clean "${corpusfreqlist}" --LD ${distance} --artifrq ${artifrq} -o "${corpusfreqlist}.ldcalc" -t ${task.cpus} --alph ${alphabet} --low ${low} --high ${high} || exit 1
    [ -s "${corpusfreqlist}.ldcalc" ] || {
        echo "ERROR: Expected output ${corpusfreqlist}.ldcalc does not exist or is empty">&2
        exit 6
    }
    """
}
//dedicated alphabet channel for the ranking step
alphabet_forrank = Channel.fromPath(params.alphabet)

process rank {
    /*
        Ranks the word confusion list with TICCL-rank.
        Emits <wordconfusionlist>.ranked
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //keep this intermediate result visible to the end-user
    label "multicore"

    input:
    val virtualenv from params.virtualenv
    val clip from params.clip
    val distance from params.distance //declared as input but not referenced in the script below
    val artifrq from params.artifrq //declared as input but not referenced in the script below
    file wordconfusionlist from wordconfusionlist
    file alphabet from alphabet_forrank
    file charconfuslist from charconfuslist_forrank

    output:
    file "${wordconfusionlist}.ranked" into rankedlist

    //unlike the other processes, this script carries no explicit shebang line
    script:
    """
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    TICCL-rank --alph "${alphabet}" --charconf "${charconfuslist}" -o "${wordconfusionlist}.ranked" --subtractartifrqfeature2 0 --clip ${clip} --skipcols=1,10,11,13 -t ${task.cpus} "${wordconfusionlist}" || exit 1
    [ -s "${wordconfusionlist}.ranked" ] || {
        echo "ERROR: Expected output ${wordconfusionlist}.ranked does not exist or is empty">&2
        exit 6
    }
    """
}
//dedicated alphabet channel for the chaining step
alphabet_forchain = Channel.fromPath(params.alphabet)

process chainer {
    /*
        Runs TICCL-chain on the ranked list to find more distant variants
        (variants-of-variants are variants too). Emits <rankedlist>.chained.ranked
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true //keep this intermediate result visible to the end-user

    input:
    val virtualenv from params.virtualenv
    val clip from params.clip //declared as input but not referenced in the script below
    file rankedlist from rankedlist
    file alphabet from alphabet_forchain

    output:
    file "${rankedlist}.chained.ranked" into rankedlist_chained

    script:
    """
    #!/bin/bash
    #activate the virtualenv when one is configured
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    TICCL-chain --caseless ${rankedlist} --alph ${alphabet} || exit 1
    mv ${rankedlist}.chained ${rankedlist}.chained.ranked || exit 2 #FoLiA-correct requires extension to be *.ranked so we add it
    [ -s "${rankedlist}.chained.ranked" ] || {
        echo "ERROR: Expected output ${rankedlist}.chained.ranked does not exist or is empty">&2
        exit 6
    }
    """
}
//Optional cleaning of the chained list (enabled with --chainclean 1). Whether or not it runs, the
//list that later steps consume is available under the channel name rankedlist_chained_cleaned.
if (params.chainclean) {
    lexicon_forchainclean = Channel.fromPath(params.lexicon).ifEmpty("Lexicon file not found")

    process chainclean {
        /*
            Clean chain file, taking into account splits and merges
        */

        publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (rather than deleting this intermediate output)

        input:
        file rankedlist from rankedlist_chained
        file lexicon from lexicon_forchainclean
        val virtualenv from params.virtualenv
        val artifrq from params.artifrq
        val low from params.low

        output:
        //NOTE(review): rankedlist is already named *.chained.ranked by the chainer process, so this expects
        //*.chained.ranked.chained.ranked.cleaned - confirm this matches what TICCL-chainclean actually writes
        file "${rankedlist}.chained.ranked.cleaned" into rankedlist_chained_cleaned

        script:
        """
        #!/bin/bash
        set +u
        if [ ! -z "${virtualenv}" ]; then
            source ${virtualenv}/bin/activate
        fi
        set -u
        #abort right away when the tool itself fails, like every other process in this pipeline does
        TICCL-chainclean --lexicon ${lexicon} --low ${low} --artifrq ${artifrq} ${rankedlist} || exit 1
        if [ ! -s "${rankedlist}.chained.ranked.cleaned" ]; then
            echo "ERROR: Expected output ${rankedlist}.chained.ranked.cleaned does not exist or is empty">&2
            exit 6
        fi
        """
    }
} else {
    //cleaning disabled: pass the chained list on unchanged
    rankedlist_chained_cleaned = rankedlist_chained
}
if (!params.containsKey('nofoliacorrect')) {
process foliacorrect {
    /*
        Correct the input documents using the ranked list, produces final output documents with <str>, using FoLiA-correct

        All input documents are handled by a single task; the results are emitted as *.foliacorrect.folia.xml
    */

    publishDir params.outputdir, mode: 'copy', overwrite: true
    label "multicore"

    input:
    file folia_ocr_documents from folia_ocr_documents_forfoliacorrect.collect() //collects all files first
    file rankedlist from rankedlist_chained_cleaned
    file punctuationmap from punctuationmap
    file unknownfreqlist from unknownfreqlist
    val extension from params.extension
    val inputclass from inputclass
    val outputclass from params.outputclass
    val virtualenv from params.virtualenv

    output:
    file "*.foliacorrect.folia.xml" into foliacorrect_documents

    script:
    """
    #!/bin/bash
    set +u
    if [ ! -z "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u
    #some bookkeeping
    mkdir outputdir
    FoLiA-correct --inputclass "${inputclass}" --outputclass "${outputclass}" --nums 10 -e ${extension} -O outputdir/ --unk "${unknownfreqlist}" --punct "${punctuationmap}" --rank "${rankedlist}" -t ${task.cpus} . || exit 1
    cd outputdir
    echo "output files:"
    ls
    #move the results up one level, renaming them so they end in .foliacorrect.folia.xml (FoLiA-correct itself writes .ticcl.xml or .ticcl.folia.xml, depending on its version)
    for f in *.xml; do
        #when nothing matched, the loop variable holds the unexpanded pattern: skip it
        if [[ \$f != "*.xml" ]]; then
            if [[ \${f%.ticcl.xml} != \$f ]]; then
                newf="\${f%.ticcl.xml}.foliacorrect.folia.xml" #old folia-correct
            elif [[ \${f%.ticcl.folia.xml} != \$f ]]; then
                newf="\${f%.ticcl.folia.xml}.foliacorrect.folia.xml" #new folia-correct
            else
                newf="\$f"
            fi
            #quoted so that file names containing whitespace survive the move
            mv "\$f" "../\$newf"
        fi
    done
    cd ..
    """
}
//Final step: produce the *.ticcl.folia.xml documents, either through string linking (default) or by
//a plain copy when --nostringlinking is given.
//foliacorrect declares a glob output, so with several documents they arrive as ONE list item;
//flatten() turns that into one item per document so each task handles exactly one file
//(foliadoc.simpleName below is only meaningful for a single file).
if (!params.containsKey('nostringlinking')) {

    process linkstrings {
        /*
            This invokes a tool that adds text markup information (t-str and t-correction) linking to the substrings. It adds a level of redundancy that is needed for proper visualisation in FLAT.
        */

        publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)

        input:
        file foliadoc from foliacorrect_documents.flatten()
        val virtualenv from params.virtualenv

        output:
        file "*.ticcl.folia.xml" into folia_ticcl_documents

        script:
        """
        #!/bin/bash
        set +u
        if [ ! -z "${virtualenv}" ]; then
            source ${virtualenv}/bin/activate
        fi
        set -u
        foliatextcontent -M ${foliadoc} > ${foliadoc.simpleName}.ticcl.folia.xml || exit 1
        """
    }

} else {

    process nolinkstrings {
        /*
            Simple file rename step
        */

        publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)

        input:
        file foliadoc from foliacorrect_documents.flatten()

        output:
        file "*.ticcl.folia.xml" into folia_ticcl_documents

        script:
        """
        cp ${foliadoc} ${foliadoc.simpleName}.ticcl.folia.xml || exit 1
        """
    }

}

//explicitly report the final documents created to stdout
folia_ticcl_documents.subscribe { println "TICCL output document written to " + params.outputdir + "/" + it.name }
}