Skip to content

Commit

Permalink
Add rdf2hdtcat.sh from hdt-java
Browse files Browse the repository at this point in the history
This is not in the package. This includes my patches to the upstream
script at <rdfhdt/hdt-java#201>.
  • Loading branch information
zmughal committed Dec 28, 2023
1 parent 39faf06 commit 7ba2b70
Showing 1 changed file with 106 additions and 0 deletions.
106 changes: 106 additions & 0 deletions bin/hdt-java-rdf2hdtcat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/bin/bash

function showhelp {
echo
echo "Script to serialize a big RDF file in n-triples format into HDT"
echo "It splits the file in N parts, compress each one with rdf2hdt, and merges them iteratively with hdtCat."
echo
echo "Usage $0 [OPTION]"
echo
echo " -c, --catscript location of hdtCat script (assuming it's in PATH by by default)"
echo " -i, --input input file (input.rdf by default)"
echo " -h, --help shows this help and exits"
echo " -n, --number number of files to split FILE (2 by default)"
echo " -o, --output output file (output.hdt by default)"
echo " -p, --parallel number of threads to serialize RDF into HDT in parallel (1 by default)"
echo " -r, --rdf2hdt location of rdf2hdt script (assuming it's in PATH by default)"
echo
}

# Defaults
declare rdf2hdt="rdf2hdt.sh"
declare hdtCat="hdtCat.sh"
declare input="input.rdf"
declare -i lines
declare output="output.hdt"
declare -i splits=2
declare -i threads=1

getopt --test > /dev/null
if [[ $? -eq 4 ]]; then
# enhanced getopt works
OPTIONS=c:i:hn:o:p:r:
LONGOPTIONS=cat:,input:,help,number:,output:,parallel:,rdf2hdt:
COMMAND=$(getopt -o $OPTIONS -l $LONGOPTIONS -n "$0" -- "$@")
if [[ $? -ne 0 ]]; then
exit 2
fi
eval set -- "$COMMAND"
else
echo "Enhanced getopt not supported. Brace yourself, this is not tested, but it should work :-)"
fi

while true; do
case "$1" in
-c|--cat)
hdtCat=$2
shift 2
;;
-i|--input)
input=$2
shift 2
;;
-n|--number)
splits=$2
shift 2
;;
-o|--output)
output=$2
shift 2
;;
-p|--parallel)
threads=$2
shift 2
;;
-r|--rdf2hdt)
rdf2hdt=$2
shift 2
;;
--)
shift
break
;;
*)
showhelp
exit 0
;;
esac
done

total_lines=$(wc -l < $input)
lines=($total_lines+$splits-1)/$splits #Set number of lines rounding up

split -l $lines $input "$input"_split_

echo "***************************************************************"
echo "Serializing into HDT $splits files using $threads threads"
echo "***************************************************************"
echo -n "$input"_split_* | xargs -I{} -d' ' -P$threads $rdf2hdt -rdftype ntriples {} {}_"$splits".hdt

for (( i=$splits; i>1; i=i/2 )); do
echo "***************************************************************"
echo "Merging $i hdt files: " "$input"_split_*_"$i".hdt
echo "***************************************************************"
command='temp=${2%_*.hdt} ; '$hdtCat' $1 $2 ${1%_*.hdt}_${temp#*split_}_$0.hdt'
echo -n "$input"_split_*_"$i".hdt | xargs -d' ' -n2 -P$threads bash -c "$command" $((i/2))
done

echo "***************************************************************"
echo "Moving output to '$output' file"
echo "***************************************************************"
mv "$input"_split*_1.hdt "$output"

echo "***************************************************************"
echo "Cleaning up split files"
echo "***************************************************************"
rm "$input"_split_*

0 comments on commit 7ba2b70

Please sign in to comment.