forked from knaw-huc/loghi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create-train-data.sh
executable file
·117 lines (98 loc) · 3.15 KB
/
create-train-data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/bin/bash
# Tag of the loghi/docker.loghi-tooling image this script runs.
VERSION="2.1.4"

# Stop at the first failing command; a pipeline fails when any stage fails.
set -eo pipefail

# ---------------------------------------------------------------------------
# User-configurable parameters
# ---------------------------------------------------------------------------
# Share (in percent) of the generated lines that goes into the training list;
# the remaining lines go into the validation list.
trainsplit=90
# 1 = pass -include_text_styles to the extraction tool, anything else = omit it.
include_text_styles=1
# 1 = pass -skip_unclear to the extraction tool, anything else = omit it.
skip_unclear=1

# DO NOT MODIFY BELOW THIS LINE
# ------------------------------
# Print the command-line help text (one line per argument) to stdout.
usage() {
  printf '%s\n' \
    'Usage: create-train-data.sh <input_path> <output_path> [numthreads]' \
    'input_path: path to images and pagexml to be converted. The pageXML must be one level deeper' \
    ' than the images in a directory called "page"' \
    'output_path: path to store the training data' \
    'numthreads: number of threads to use for processing the images (default: 4)'
}
# Argument validation: <input_path> and <output_path> are mandatory.
# Diagnostics go to stderr so they never mix with regular output.
if (( $# < 2 )); then
  echo "Illegal number of parameters" >&2
  usage >&2
  exit 1
fi

# Number of worker threads handed to the extraction tool via -threads.
# Defaults to 4; an optional third argument overrides it and must be a
# positive integer so a typo is caught here rather than inside the container.
numthreads=4
if [[ -n "${3:-}" ]]; then
  if ! [[ "$3" =~ ^0*[1-9][0-9]*$ ]]; then
    echo "numthreads must be a positive integer, got: $3" >&2
    usage >&2
    exit 1
  fi
  numthreads=$3
  echo "Setting numthreads=$numthreads"
fi
# Obtain absolute paths for input and output directories.
#
# resolve_dirs <input_path> <output_path>
#   Globals written: inputdir, outputdir (absolute paths).
#   Creates <output_path> (including missing parents) BEFORE resolving it, so
#   a nested, not-yet-existing output path works. All expansions are quoted so
#   paths containing whitespace or glob characters are handled.
#   Returns non-zero when <input_path> is not a directory or when the output
#   directory cannot be created or resolved.
resolve_dirs() {
  local in_path=$1
  local out_path=$2

  if [[ ! -d "$in_path" ]]; then
    echo "Input path is not a directory: $in_path" >&2
    return 1
  fi

  mkdir -p -- "$out_path" || return 1
  inputdir=$(realpath -- "$in_path") || return 1
  outputdir=$(realpath -- "$out_path") || return 1
}

# With `set -e` active (enabled at the top of this script) a failure here
# aborts the run before anything is deleted or written.
resolve_dirs "$1" "$2"
# Names of the generated list files; all three share one prefix inside the
# output directory.
list_prefix="${outputdir}/training_all"
filelist="${list_prefix}.txt"             # every image/text pair
filelisttrain="${list_prefix}_train.txt"  # training subset
filelistval="${list_prefix}_val.txt"      # validation subset

# Loghi tooling Docker image, pinned to the configured version
DOCKERLOGHITOOLING="loghi/docker.loghi-tooling:${VERSION}"
# Optional command-line flags for the Loghi tool.
# Each flag starts out empty and is filled in only when the matching
# user setting equals 1.
INCLUDETEXTSTYLES=""
if (( include_text_styles == 1 )); then
  INCLUDETEXTSTYLES=" -include_text_styles "
fi

SKIP_UNCLEAR=""
if (( skip_unclear == 1 )); then
  SKIP_UNCLEAR=" -skip_unclear "
fi
# Housekeeping: remove any existing *.done files.
#
# remove_done_markers <dir>
#   Deletes every regular file named *.done below <dir>.
#   - The path is quoted, so directories containing whitespace work.
#   - `-type f` leaves directories that merely end in ".done" alone; plain
#     `rm` cannot remove those and would make the whole run fail.
#   - `-exec … {} +` batches the files into as few `rm` calls as possible.
#   Returns non-zero when <dir> is not a directory or a deletion fails.
remove_done_markers() {
  local root=$1

  if [[ ! -d "$root" ]]; then
    echo "Cannot clean *.done files, not a directory: $root" >&2
    return 1
  fi

  find "$root" -type f -name '*.done' -exec rm -- {} +
}

remove_done_markers "$inputdir"
# Informative output: echo the resolved locations before any work starts.
echo "Input directory: $inputdir"
echo "Output directory: $outputdir"
echo "File lists:"
echo " All: $filelist"
echo " Training: $filelisttrain"
echo " Validation: $filelistval"
# The count covers every entry find reports below the input directory
# (directories included). The path is quoted so a directory name containing
# whitespace is searched as one path instead of being split into several.
echo "Input files: $(find "$inputdir" | wc -l)"
# Run Loghi's MinionCutFromImageBasedOnPageXMLNew in Docker
echo "Running image segmentation and text extraction..."

# The optional flags are kept as space-padded strings that may be empty.
# Split each into an array so an empty setting contributes no argument and
# every other expansion below can be safely quoted.
read -ra text_style_flags <<< "$INCLUDETEXTSTYLES"
read -ra skip_unclear_flags <<< "$SKIP_UNCLEAR"

# Run the container with the uid:gid of $USER when that variable is set,
# otherwise with the ids of the user running this script.
id_target=()
if [[ -n "${USER:-}" ]]; then
  id_target=("$USER")
fi
container_user="$(id -u "${id_target[@]}"):$(id -g "${id_target[@]}")"

# Input and output directories are mounted at the same absolute paths inside
# the container, so the paths passed to the tool are valid on both sides.
docker run -u "$container_user" --rm \
  -v "$inputdir/:$inputdir/" \
  -v "$outputdir:$outputdir" \
  "$DOCKERLOGHITOOLING" \
  /src/loghi-tooling/minions/target/appassembler/bin/MinionCutFromImageBasedOnPageXMLNew \
  -input_path "$inputdir" \
  -outputbase "$outputdir" \
  -channels 4 \
  -output_type png \
  -write_text_contents \
  -threads "$numthreads" \
  "${text_style_flags[@]}" \
  -no_page_update \
  "${skip_unclear_flags[@]}" \
  -use_2013_namespace

# Counts every entry find reports below the output directory.
echo "Output files: $(find "$outputdir" | wc -l)"
# Generate the image/text pair list.
#
# write_file_list <search_dir> <list_file>
#   Overwrites <list_file> with one line per *.png found below <search_dir>:
#       <path to png><TAB><content of the .txt file with the same base name>
#   - find output is consumed NUL-delimited, so paths with whitespace or
#     glob characters stay intact.
#   - printf '%s' writes the text verbatim; `echo -e` would expand backslash
#     sequences such as \t or \n that occur inside a transcription.
#   Returns non-zero when <search_dir> is not a directory, the list cannot be
#   written, or a PNG has no readable .txt companion.
write_file_list() {
  local search_dir=$1
  local list_file=$2
  local png txt_file text

  if [[ ! -d "$search_dir" ]]; then
    echo "Cannot build file list, not a directory: $search_dir" >&2
    return 1
  fi

  while IFS= read -r -d '' png; do
    txt_file="${png%.*}.txt"
    if [[ ! -r "$txt_file" ]]; then
      echo "Missing text file for image: $png" >&2
      return 1
    fi
    # Command substitution drops trailing newlines from the text.
    text=$(< "$txt_file") || return 1
    printf '%s\t%s\n' "$png" "$text"
  done < <(find "$search_dir" -name '*.png' -print0) > "$list_file"
}

echo "Generating file lists..."
write_file_list "$outputdir" "$filelist"
# Create training and validation file lists.
#
# split_file_list <list> <train_list> <val_list> <train_percent>
#   Shuffles the lines of <list>; the first floor(lines * percent / 100)
#   shuffled lines are written to <train_list>, all remaining lines to
#   <val_list>.
#   - Nothing is written to the current working directory (no xaa/xab chunk
#     files); the only scratch file comes from mktemp and is removed again.
#   - Every line lands in exactly one of the two lists for any percentage
#     from 0 to 100, including values below 50 and exactly 100.
#   Returns non-zero when <list> is missing or empty, <train_percent> is not
#   an integer in 0..100, or writing fails.
split_file_list() {
  local list=$1
  local train_list=$2
  local val_list=$3
  local percent=$4
  local total n_train shuffled status=0

  if [[ ! -f "$list" ]]; then
    echo "File list not found: $list" >&2
    return 1
  fi
  if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
    echo "trainsplit must be an integer between 0 and 100, got: $percent" >&2
    return 1
  fi
  percent=$(( 10#$percent ))  # force base 10 so values like "090" are valid
  if (( percent > 100 )); then
    echo "trainsplit must be an integer between 0 and 100, got: $percent" >&2
    return 1
  fi

  total=$(wc -l < "$list") || return 1
  if (( total == 0 )); then
    echo "File list is empty, nothing to split: $list" >&2
    return 1
  fi
  n_train=$(( total * percent / 100 ))

  shuffled=$(mktemp) || return 1
  if ! {
    shuf -- "$list" > "$shuffled" &&
      head -n "$n_train" -- "$shuffled" > "$train_list" &&
      tail -n "+$(( n_train + 1 ))" -- "$shuffled" > "$val_list"
  }; then
    status=1
  fi
  rm -f -- "$shuffled"
  return "$status"
}

echo "Splitting data into training and validation sets..."
split_file_list "$filelist" "$filelisttrain" "$filelistval" "$trainsplit"