-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This is not in the package. This includes my patches to the upstream script at <rdfhdt/hdt-java#201>.
- Loading branch information
Showing
1 changed file
with
106 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#!/bin/bash | ||
|
||
function showhelp { | ||
echo | ||
echo "Script to serialize a big RDF file in n-triples format into HDT" | ||
echo "It splits the file in N parts, compress each one with rdf2hdt, and merges them iteratively with hdtCat." | ||
echo | ||
echo "Usage $0 [OPTION]" | ||
echo | ||
echo " -c, --catscript location of hdtCat script (assuming it's in PATH by by default)" | ||
echo " -i, --input input file (input.rdf by default)" | ||
echo " -h, --help shows this help and exits" | ||
echo " -n, --number number of files to split FILE (2 by default)" | ||
echo " -o, --output output file (output.hdt by default)" | ||
echo " -p, --parallel number of threads to serialize RDF into HDT in parallel (1 by default)" | ||
echo " -r, --rdf2hdt location of rdf2hdt script (assuming it's in PATH by default)" | ||
echo | ||
} | ||
|
||
# Defaults | ||
declare rdf2hdt="rdf2hdt.sh" | ||
declare hdtCat="hdtCat.sh" | ||
declare input="input.rdf" | ||
declare -i lines | ||
declare output="output.hdt" | ||
declare -i splits=2 | ||
declare -i threads=1 | ||
|
||
getopt --test > /dev/null | ||
if [[ $? -eq 4 ]]; then | ||
# enhanced getopt works | ||
OPTIONS=c:i:hn:o:p:r: | ||
LONGOPTIONS=cat:,input:,help,number:,output:,parallel:,rdf2hdt: | ||
COMMAND=$(getopt -o $OPTIONS -l $LONGOPTIONS -n "$0" -- "$@") | ||
if [[ $? -ne 0 ]]; then | ||
exit 2 | ||
fi | ||
eval set -- "$COMMAND" | ||
else | ||
echo "Enhanced getopt not supported. Brace yourself, this is not tested, but it should work :-)" | ||
fi | ||
|
||
while true; do | ||
case "$1" in | ||
-c|--cat) | ||
hdtCat=$2 | ||
shift 2 | ||
;; | ||
-i|--input) | ||
input=$2 | ||
shift 2 | ||
;; | ||
-n|--number) | ||
splits=$2 | ||
shift 2 | ||
;; | ||
-o|--output) | ||
output=$2 | ||
shift 2 | ||
;; | ||
-p|--parallel) | ||
threads=$2 | ||
shift 2 | ||
;; | ||
-r|--rdf2hdt) | ||
rdf2hdt=$2 | ||
shift 2 | ||
;; | ||
--) | ||
shift | ||
break | ||
;; | ||
*) | ||
showhelp | ||
exit 0 | ||
;; | ||
esac | ||
done | ||
|
||
total_lines=$(wc -l < $input) | ||
lines=($total_lines+$splits-1)/$splits #Set number of lines rounding up | ||
|
||
split -l $lines $input "$input"_split_ | ||
|
||
echo "***************************************************************" | ||
echo "Serializing into HDT $splits files using $threads threads" | ||
echo "***************************************************************" | ||
echo -n "$input"_split_* | xargs -I{} -d' ' -P$threads $rdf2hdt -rdftype ntriples {} {}_"$splits".hdt | ||
|
||
for (( i=$splits; i>1; i=i/2 )); do | ||
echo "***************************************************************" | ||
echo "Merging $i hdt files: " "$input"_split_*_"$i".hdt | ||
echo "***************************************************************" | ||
command='temp=${2%_*.hdt} ; '$hdtCat' $1 $2 ${1%_*.hdt}_${temp#*split_}_$0.hdt' | ||
echo -n "$input"_split_*_"$i".hdt | xargs -d' ' -n2 -P$threads bash -c "$command" $((i/2)) | ||
done | ||
|
||
echo "***************************************************************" | ||
echo "Moving output to '$output' file" | ||
echo "***************************************************************" | ||
mv "$input"_split*_1.hdt "$output" | ||
|
||
echo "***************************************************************" | ||
echo "Cleaning up split files" | ||
echo "***************************************************************" | ||
rm "$input"_split_* |