-
Notifications
You must be signed in to change notification settings - Fork 6
/
data-to-dict.sh
executable file
·78 lines (57 loc) · 3.58 KB
/
data-to-dict.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash
# Import MySQL dumps to local MySQL, merge with existing dict (german.dict), export as CSV, transform CSV to Morfologik binary
# for use in e.g. LanguageTool
# Credentials for the local MySQL server.
DBUSER="root"
DBPASS=""
# LanguageTool installation directory; languagetool.jar in here is put on the Java classpath.
LT_PATH="/prg/LanguageTool-5.0"
# the value of MySQL's secure_file_priv setting - only SQL files in here can be imported:
IMPORT_DIR=/var/lib/mysql-files

# Fail early, with a non-zero status, when a required directory is missing.
# (A bare "exit" after "echo" would report success to the caller.)
if [[ ! -d "$LT_PATH" ]]; then
  echo "Error: LT_PATH does not exist: $LT_PATH" >&2
  exit 1
fi
if [[ ! -d "$IMPORT_DIR" ]]; then
  echo "Error: IMPORT_DIR does not exist: $IMPORT_DIR" >&2
  exit 1
fi

# Unpack the dumps. gunzip's status is deliberately not checked so that a run
# with already-unpacked .sql files still works; the loop below verifies that
# every dump is actually available before the database is touched.
gunzip verben.sql.gz nomen.sql.gz adjektive.sql.gz
for dump in verben.sql nomen.sql adjektive.sql; do
  if [[ ! -r "$dump" ]]; then
    echo "Error: SQL dump not found or not readable: $dump" >&2
    exit 1
  fi
done

# Create the temporary database; this fails if it already exists.
mysqladmin -u "$DBUSER" --password="$DBPASS" create flexiontmp || {
  echo "Stopping due to previous error" >&2
  exit 1
}

echo "Starting MySQL import..."
# Load each dump into the temporary database. Stop on the first failure so that
# the later steps never work from a partially imported database.
for dump in verben.sql nomen.sql adjektive.sql; do
  mysql -u "$DBUSER" --password="$DBPASS" flexiontmp <"$dump" || {
    echo "Error: importing $dump failed; temporary database flexiontmp was not dropped" >&2
    exit 1
  }
done
#######################################
# Run the SQL statements stored in $IMPORT_DIR/tmp.sql against the flexiontmp
# database, then delete that file. The caller writes tmp.sql beforehand.
# Globals:   IMPORT_DIR, DBUSER, DBPASS (read)
# Arguments: none
# Outputs:   the SQL text and the mysql command line on stdout
#            NOTE(review): the echoed command line contains the password in
#            clear text.
# Returns:   the exit status of mysql, so callers can detect a failed run
#            (tmp.sql is removed in either case)
#######################################
dbimport() {
  local sql_file="$IMPORT_DIR/tmp.sql"
  local rc=0
  echo "Running SQL to export data to CSV:"
  cat "$sql_file"
  echo "Running: mysql -u $DBUSER --password=$DBPASS flexiontmp <$IMPORT_DIR/tmp.sql"
  # Remember mysql's status instead of letting the following rm overwrite it.
  mysql -u "$DBUSER" --password="$DBPASS" flexiontmp <"$sql_file" || rc=$?
  rm -- "$sql_file"
  return "$rc"
}
# Paths used repeatedly below.
resource_dir=src/main/resources/org/languagetool/resource/de
lt_jar="$LT_PATH/languagetool.jar"
reordered_verbs=/tmp/output-verben-reordered.csv

# Remove CSV files left over from an earlier run, then export each table.
# csv.sql is a template: _OUTPUT_ is replaced with the target CSV path and
# _TABLE_ with the table name; dbimport executes the resulting tmp.sql.
rm -f "$IMPORT_DIR/output-verben.csv" "$IMPORT_DIR/output-nomen.csv" "$IMPORT_DIR/output-adjektive.csv"
for table in verben nomen adjektive; do
  sed -e "s@_OUTPUT_@$IMPORT_DIR/output-$table.csv@" -e "s/_TABLE_/$table/" csv.sql >"$IMPORT_DIR/tmp.sql"
  dbimport
done

# ^\t -> some items have empty forms (because they are deleted), filter them:
grep -v -P '^\t' "$IMPORT_DIR/output-verben.csv" | python3 ./transform-pos.py >"$reordered_verbs"

# export existing dict first, so it can be added:
# Abort if this fails: continuing would overwrite german.dict below with a
# dictionary that lacks the existing entries (or uses a stale dump file).
# Temp files and the flexiontmp database are left in place in that case.
java -cp "$lt_jar" org.languagetool.tools.DictionaryExporter \
  -i "$resource_dir/german.dict" -info "$resource_dir/german.info" -o old_version_dump.txt || {
  echo "Error: exporting the existing dictionary failed, german.dict was not rebuilt" >&2
  exit 1
}
# Strip trailing spaces from the exported lines.
sed 's/ \+$//' old_version_dump.txt >old_version_dump_trimmed.txt

# Merge the old dictionary, the extra word lists and the fresh exports into one
# sorted, de-duplicated file, again dropping lines that start with a tab.
cat old_version_dump_trimmed.txt "$resource_dir/EIG.txt" "$resource_dir/sonstige.txt" \
  "$reordered_verbs" "$IMPORT_DIR/output-nomen.csv" "$IMPORT_DIR/output-adjektive.csv" \
  | sort | uniq | grep -v -P '^\t' >output-all.csv
# Remove the first ":HO<digit>" occurrence from every line.
sed -i 's/:HO[0-9]//' output-all.csv

echo "Size of dictionary as plain text:"
ls -lh output-all.csv

echo "Building POS dictionary, using $resource_dir/german.info:"
java -cp "$lt_jar" org.languagetool.tools.POSDictionaryBuilder \
  -i output-all.csv -info "$resource_dir/german.info" -o "$resource_dir/german.dict"

echo "Building synth dictionary, using $resource_dir/german_synth.info:"
java -cp "$lt_jar" org.languagetool.tools.SynthDictionaryBuilder \
  -i output-all.csv -info "$resource_dir/german_synth.info" -o "$resource_dir/german_synth.dict"

# Write the distinct values of the third tab-separated column to german_tags.txt.
LANG=C awk 'BEGIN {FS="\t"} {print $3}' output-all.csv | sort | uniq >"$resource_dir/german_tags.txt"

echo "Cleaning up temp files..."
# The reordered verb file lives in /tmp, not in $IMPORT_DIR, so remove it from
# where it was actually written.
rm output-all.csv "$IMPORT_DIR/output-verben.csv" "$reordered_verbs" \
  "$IMPORT_DIR/output-nomen.csv" "$IMPORT_DIR/output-adjektive.csv"

echo "Dropping temp database..."
mysqladmin -u "$DBUSER" --password="$DBPASS" drop flexiontmp