forked from HogeBlekker/load_patstat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_patstat.sh
executable file
·259 lines (218 loc) · 7.34 KB
/
load_patstat.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env bash
# A POSIX variable
OPTIND=1 # Reset in case getopts has been used previously in the shell.
# Initialize our own variables:
MYSQLDATAPATH="/var/lib/mysql"
ZIPFILESPATH=./data
LOGPATH=./logs
verbose=0
DEMO=0
USER=
PASS=
HOST=
DB=
TMPDIR=/dev/shm
function show_help() {
echo "Usage: [-v] [-t] -u mysql_user -p mysql_pass -h mysql_host -d mysql_dbname -z patstat_zips_dir -e tmp_dir -m mysql_data_path"
echo " -v: be verbose"
echo " -t: load small chunks of data for testing purposes"
echo " -z: directory containing patstat zipped files shipped in DVDs (defaults to $ZIPFILESPATH)"
echo " -o: output and error logs directory (defaults to $LOGPATH)"
echo " -e: temp dir to extract the zip files (defaults to $TMPDIR)"
echo " -m: directory of mysql data, used for running myisamchk and myisampack (defaults to $MYSQLDATAPATH)"
}
while getopts "?vto:u:p:d:h:z:m:e:" opt; do
case "$opt" in
\?)
show_help
exit 0
;;
v) verbose=1
;;
o) LOGPATH=$OPTARG
;;
t) DEMO=1
;;
u) USER=$OPTARG
;;
p) PASS=$OPTARG
;;
h) HOST=$OPTARG
;;
d) DB=$OPTARG
;;
z) ZIPFILESPATH=$OPTARG
;;
e) TMPDIR=$OPTARG
;;
m) MYSQLDATAPATH=$OPTARG
;;
esac
done
shift $((OPTIND-1))
[ "$1" = "--" ] && shift
if [[ -z $USER ]] || [[ -z $PASS ]] || [[ -z $HOST ]] || [[ -z $DB ]] || [[ -z $ZIPFILESPATH ]] || [[ -z $TMPDIR ]]
then
show_help
exit 1
fi
if [[ ! $verbose -eq 0 ]]
then
echo "user: $USER pass: $PASS host: $HOST database: $DB"
echo "zipped files path: $ZIPFILESPATH"
echo "verbose=$verbose, test=$DEMO leftovers: $@"
fi
if [ ! -d $ZIPFILESPATH ]; then
echo ERROR: path $ZIPFILESPATH does not exist
exit
fi
SENDSQL="mysql -vv --show-warnings --local-infile -u$USER -p$PASS -h$HOST $DB"
function create_db() {
./tools/create_schema.sh $DB | mysql -vv --show-warnings -u$USER -p$PASS -h$HOST
echo FLUSH TABLES \; | $SENDSQL
}
load_table() {
TIME=$(date '+%F %T %Z')
INTIME=$(date +%s)
# This removes all use of indexes for the table.
# An option value of 0 disables updates to all indexes, which can be used to get faster inserts.
echo TRUNCATE TABLE $1 \; | $SENDSQL
echo ALTER TABLE $1 DISABLE KEYS\; | $SENDSQL ;
myisamchk --keys-used=0 -rqp $MYSQLDATAPATH/$DB/$1*.MYI
echo $TIME Loading data in $1 from $3 files
# all files containing data for the current table
EXPECTED_ROWCOUNT=0
# some rows are buggy, that is, they contain a backslash just before the last double quote
# e.g.,
# 6597821,"US",664004,"CellTech Power, Inc.","Westboro,MA,\"
# so we must fix this and we use sed regexp replacement
# the original sed expr is
# sed -e 's/\\\("[^\"]$\)/\1/g'
# but we've to add some extra quotes in order to put the command in a shell variable
SED_FIX_1=`echo sed -e 's/\\\\\\("[^\"]$\\)/\1/g'`
# other rows are bugged as well since the cotain a backslash just before some double quote
# separating different columns
# e.g.,
# 8638854,"",4318,"BROTHER KOGYO KABUSHIKI KAISHA\",""
# ... ,"COMPANY",108638854,"BROTHER KOGYO KABUSHIKI KAISHA\",0
# so again we've to fix it using sed. The original sed expr used is:
# sed -e 's/\\\(",[0-9\"]\)/\1/g'
# the escaped expression is
SED_FIX_2=`echo sed -e 's/\\\\\\(",[0-9\"]\\)/\1/g'`
prefix=$(echo $1 | cut -d'_' -f 1) # grab only the prefix, e.g. tls201, from the full table name
for ZIPPEDFILE in `find $ZIPFILESPATH -name "$prefix\_part*\.zip" | sort`
do
echo loading part file $ZIPPEDFILE
UNZIPPEDFILE=$TMPDIR/`basename $ZIPPEDFILE`.txt
if [ $DEMO -eq 1 ]
then
funzip $ZIPPEDFILE | head -n 10000 | $SED_FIX_1 | $SED_FIX_2 > $UNZIPPEDFILE
else
funzip $ZIPPEDFILE | $SED_FIX_1 | $SED_FIX_2 > $UNZIPPEDFILE
fi
let "EXPECTED_ROWCOUNT = EXPECTED_ROWCOUNT + `awk 'END { print NR }' $UNZIPPEDFILE` - 1"
# echo $EXPECTED_ROWCOUNT
$SENDSQL <<EOF
set autocommit = 0;
set unique_checks = 0;
set foreign_key_checks = 0;
LOAD DATA LOCAL INFILE "$UNZIPPEDFILE"
INTO TABLE $1 FIELDS TERMINATED BY ","
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY ''
LINES TERMINATED BY '\r\n'
IGNORE 1 LINES;
commit;
SHOW WARNINGS;
EOF
rm -rf $UNZIPPEDFILE
done
echo ALTER TABLE $1 ENABLE KEYS \; | $SENDSQL ;
# If you intend only to read from the table in the future, use myisampack to compress it.
# only if it was not partitioned
echo "compressing"
myisampack $MYSQLDATAPATH/$DB/$1.MYI
# Re-create the indexes
myisamchk -rqp --sort-buffer-size=2G $MYSQLDATAPATH/$DB/$1*.MYI
# FLUSH TABLES
echo FLUSH TABLES \; | $SENDSQL
echo "no. of rows inserted into $1: `echo SELECT COUNT\(\*\) FROM $1 | $SENDSQL` (expected: $EXPECTED_ROWCOUNT)"
OUTTIME=$(date +%s)
echo " $OUTTIME - $INTIME = " $(( $OUTTIME - $INTIME )) sec " = " $(( ( $OUTTIME - $INTIME ) / 60 )) min
# read -p 'waiting...'
}
function main(){
# creates an empty database schema
create_db
# loads official patstat tables
load_table tls201_appln
load_table tls202_appln_title
load_table tls204_appln_prior
load_table tls205_tech_rel
load_table tls206_person
load_table tls906_person # this is person table with harmonized names
load_table tls207_pers_appln
load_table tls209_appln_ipc
load_table tls210_appln_n_cls
load_table tls211_pat_publn
load_table tls212_citation
load_table tls214_npl_publn
load_table tls215_citn_categ
load_table tls216_appln_contn
load_table tls222_appln_jp_class
load_table tls223_appln_docus
load_table tls224_appln_cpc
load_table tls226_person_orig
load_table tls227_pers_publn
load_table tls228_docdb_fam_citn
load_table tls229_appln_nace2
load_table tls230_appln_techn_field
load_table tls801_country
load_table tls802_legal_event_code
load_table tls901_techn_field_ipc
load_table tls902_ipc_nace2
load_table tls203_appln_abstr
load_table tls221_inpadoc_prs
# finally, prints out some statistics on loaded tables
$SENDSQL <<EOF
SELECT table_name, table_rows
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA = '$DB'
;
EOF
}
tstamp=`date +"%Y-%m-%d"`
# call the main function and record both std out and std err
main 2> $LOGPATH/error_log_$tstamp > $LOGPATH/output_log_$tstamp
# check of errors
errlines=`wc -l $LOGPATH/error_log_$tstamp | cut -d' ' -f1`
if [ $errlines -gt 0 ]
then
if [ $errlines -le 3 ]
then
echo "THE FOLLOWING ERRORS HAVE BEEN DETECTED: "
cat $LOGPATH/error_log_$tstamp
else
echo "SOME ERRORS OCCURRED."
echo "IT MAY BE SAFE TO IGNORE THEM, BUT PLEASE CHECK FILE $LOGPATH/error_log_$tstamp"
fi
echo
fi
# check for warnings
warnlines=`cat $LOGPATH/output_log_$tstamp | grep Warning | wc -l`
if [ $warnlines -gt 0 ]
then
if [ $warnlines -lt 10 ]
then
echo "THE FOLLOWING MySQL WARNINGS HAVE BEEN GENERATED: "
cat cat $LOGPATH/output_log_$tstamp | grep Warning
else
echo "SOME MySQL WARNINGS HAVE BEEN GENERATED."
echo "IT MAY BE SAFE TO IGNORE THEM, BUT PLEASE CHECK FILE $LOGPATH/output_log_$tstamp"
fi
fi
echo
echo "CREATED TABLES HAVE THE FOLLOWING NUMBERS OF ROWS"
echo "PLEASE CHECK THEM AGAINST PATSTAT DOCUMENTATION"
# output table counters to console
grep -E "^tls.*$" $LOGPATH/output_log_$tstamp