Skip to content

Commit

Permalink
Merge pull request #48 from unipept/fix/parser-check
Browse files Browse the repository at this point in the history
Improve the check for the type of parser that should be used
  • Loading branch information
pverscha authored Mar 18, 2024
2 parents 1bbc3fd + 1be5ebe commit 53e816e
Showing 1 changed file with 19 additions and 2 deletions.
21 changes: 19 additions & 2 deletions scripts/build_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,23 @@ create_taxon_tables() {
log "Finished creating the taxon tables."
}

#######################################
# Check whether a URL serves a gzip-compressed XML document.
#
# The resource is streamed with curl, decompressed with gunzip, and only the
# first lines of the decompressed output (the default window of `head`) are
# inspected for a line that starts with an XML declaration ("<?xml"). Because
# `head` stops reading after those lines, the pipeline is torn down early and
# the complete file is normally not transferred.
#
# Arguments:
#   $1 - URL of the (gzip-compressed) database file to probe
# Returns:
#   0 if an XML declaration was found, 1 otherwise (including an empty URL,
#   a failed download, or content that is not gzip-compressed)
#######################################
url_points_to_xml() {
    # Keep these local so probing a URL never overwrites same-named globals
    # of the calling script.
    local url="${1:-}"
    local match=""

    # An empty URL can never point to an XML file; don't bother invoking curl.
    if [[ -z "$url" ]]
    then
        return 1
    fi

    # This is only a probe, so gunzip's diagnostics for non-gzip input are
    # discarded. The pipeline status is ignored on purpose ("|| true"): it is
    # non-zero whenever grep finds nothing or head closes the pipe early, and
    # only the captured text decides the outcome.
    match=$(curl -s "$url" | gunzip 2> /dev/null | head | grep '^<?xml') || true

    if [[ -n "$match" ]]
    then
        # This is an XML file, success!
        return 0
    else
        # This is not an XML file, return non-zero exit code
        return 1
    fi
}

download_and_convert_all_sources() {
IDX=0

Expand Down Expand Up @@ -406,7 +423,7 @@ download_and_convert_all_sources() {
mkdir -p "$DB_INDEX_OUTPUT"

# The parser that should be used depends on the file type of the database that's been provided to this script.
if [[ $DB_SOURCE == *xml.gz ]]
if [[ $DB_SOURCE == *xml.gz ]] || url_points_to_xml "$DB_SOURCE"
then
PARSER="xml-parser"
elif [[ $DB_SOURCE == *dat.gz ]]
Expand All @@ -428,7 +445,7 @@ download_and_convert_all_sources() {

reportProgress -1 "Downloading database index for $DB_TYPE." 3

curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"

# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
Expand Down

0 comments on commit 53e816e

Please sign in to comment.