Merge pull request #48 from unipept/fix/parser-check

Improve the check for the type of parser that should be used
unipept · Mar 18, 2024 · 53e816e · 53e816e
2 parents 1bbc3fd + 1be5ebe
commit 53e816e
Showing 1 changed file with 19 additions and 2 deletions.
diff --git a/scripts/build_database.sh b/scripts/build_database.sh
@@ -379,6 +379,23 @@ create_taxon_tables() {
 	log "Finished creating the taxon tables."
 }
 
+url_points_to_xml() {
+  URL="$1"
+
+  MATCH=$(curl -s "$URL"| gunzip | head | grep '^<?xml')
+
+  # Use curl to download the first 1KB of the file and attempt to decompress it
+  # Check if the decompressed output starts with an XML declaration or seems like an XML file
+  if [[ -n "$MATCH" ]]
+  then
+    # This is an XML file, success!
+    return 0
+  else
+    # This is not an XML file, return non-zero exit code
+    return 1
+  fi
+}
+
 download_and_convert_all_sources() {
   IDX=0
 
@@ -406,7 +423,7 @@ download_and_convert_all_sources() {
     mkdir -p "$DB_INDEX_OUTPUT"
 
     # The parser that should be used, depends on the filetype of the database that's been provided to this script.
-    if [[ $DB_SOURCE == *xml.gz ]]
+    if [[ $DB_SOURCE == *xml.gz ]] || url_points_to_xml "$DB_SOURCE"
     then
       PARSER="xml-parser"
     elif [[ $DB_SOURCE == *dat.gz ]]
@@ -428,7 +445,7 @@ download_and_convert_all_sources() {
 
       reportProgress -1 "Downloading database index for $DB_TYPE." 3
 
-      curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
+      curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
 
       # Now, compress the different chunks
       CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")