Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically switch between XML and DAT parser for UniProtKB #45

Merged
merged 3 commits on
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions scripts/build_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ Required parameters:

* DB_SOURCES: List of UniProt source URLs. The items in this list should be delimited by commas. Commonly used
databases and their corresponding sources are:
- swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
- trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz
- swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
- trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz

* OUTPUT_DIR: Directory in which the tsv.lz4-files that are produced by this script will be stored.

Expand Down Expand Up @@ -99,6 +99,10 @@ terminateAndExit() {
# an error has occurred and will properly exit the script.
errorAndExit() {
echo "Error: the script experienced an error while trying to build the requested database." 1>&2
if [[ -n "$1" ]]
then
echo "Error details: $1" 1>&2
fi
echo "" 1>&2
clean
exit 2
Expand Down Expand Up @@ -256,7 +260,6 @@ INTDIR="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" # Where should I store intermediate TS
KMER_LENGTH=9 # What is the length (k) of the K-mer peptides?
CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command should I use?
CMD_GZIP="pigz -" # Which pipe compression command should I use for .gz files?
CMD_ZCAT="pigz -dc" # Which decompression command should I use for .gz files?
CMD_LZ4="lz4 -c" # Which pipe compression command should I use for .lz4 files?
CMD_LZ4CAT="lz4 -dc" # Which decompression command should I use for .lz4 files?
ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?
Expand Down Expand Up @@ -299,7 +302,7 @@ guz() {
fifo="$(uuidgen)-$(basename "$1")"
mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
{ $CMD_ZCAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
{ pigz -dc "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
}

lz() {
Expand Down Expand Up @@ -402,6 +405,17 @@ download_and_convert_all_sources() {

mkdir -p "$DB_INDEX_OUTPUT"

# The parser that should be used, depends on the filetype of the database that's been provided to this script.
# Dispatch on the file extension of the source URL: *.xml.gz -> XML parser, *.dat.gz -> DAT parser,
# anything else is an unsupported format and aborts the build.
case "$DB_SOURCE" in
	*xml.gz)
		PARSER="xml-parser"
		;;
	*dat.gz)
		PARSER="dat-parser"
		;;
	*)
		errorAndExit "No known parser available for provided UniProtKB file format. Only XML and DAT are available."
		;;
esac

# No ETags or other header requests are available if a database is requested from the UniProt REST API. That's why
# we always need to reprocess the database in that case.
if [[ $DB_SOURCE =~ "rest" ]]
Expand All @@ -414,7 +428,7 @@ download_and_convert_all_sources() {

reportProgress -1 "Downloading database index for $DB_TYPE." 3

curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"

# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
Expand Down Expand Up @@ -458,7 +472,7 @@ download_and_convert_all_sources() {

SIZE="$(curl -I "$DB_SOURCE" -s | grep -i content-length | tr -cd '[0-9]')"

curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"

# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
Expand Down Expand Up @@ -537,11 +551,11 @@ create_most_tables() {

log "Started sorting peptides table"

$CMD_LZ4CAT $INTDIR/peptides-out.tsv.lz4 \
$CMD_LZ4CAT "$INTDIR/peptides-out.tsv.lz4" \
| LC_ALL=C $CMD_SORT -k2 \
| $CMD_LZ4 > $INTDIR/peptides-equalized.tsv.lz4
| $CMD_LZ4 > "$INTDIR/peptides-equalized.tsv.lz4"

rm $INTDIR/peptides-out.tsv.lz4
rm "$INTDIR/peptides-out.tsv.lz4"
log "Finished calculation of most tables with status $?"
}

Expand All @@ -557,8 +571,8 @@ number_sequences() {
mkfifo "p_eq"
mkfifo "p_or"

$CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 3 | sort | uniq > "p_or" &
$CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 2 | uniq > "p_eq" &
$CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" | cut -f 3 | sort | uniq > "p_or" &
$CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" | cut -f 2 | uniq > "p_eq" &

sort -u -m "p_or" "p_eq" | cat -n \
| sed 's/^ *//' | $CMD_LZ4 - > "$INTDIR/sequences.tsv.lz4"
Expand All @@ -572,7 +586,7 @@ substitute_aas() {
have "$INTDIR/peptides-equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4"

log "Started the substitution of equalized AA's by ID's for the peptides."
$CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 \
$CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" \
| join -t ' ' -o '1.1,2.1,1.3,1.4,1.5,1.6' -1 2 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \
| $CMD_LZ4 - > "$INTDIR/peptides_by_equalized.tsv.lz4"

Expand All @@ -591,7 +605,7 @@ substitute_aas() {
calculate_equalized_lcas() {
have "$INTDIR/peptides_by_equalized.tsv.lz4" || return
log "Started the calculation of equalized LCA's."
$CMD_LZ4CAT $INTDIR/peptides_by_equalized.tsv.lz4 | cut -f 2,6 \
$CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" | cut -f 2,6 \
| $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \
| $CMD_LZ4 - > "$INTDIR/LCAs_equalized.tsv.lz4"
log "Finished the calculation of equalized LCA's (after substituting AA's by ID's) with status $?."
Expand All @@ -601,7 +615,7 @@ calculate_equalized_lcas() {
calculate_original_lcas() {
have "$INTDIR/peptides_by_original.tsv.lz4" || return
log "Started the calculation of original LCA's"
$CMD_LZ4CAT $INTDIR/peptides_by_original.tsv.lz4 | cut -f 3,6 \
$CMD_LZ4CAT "$INTDIR/peptides_by_original.tsv.lz4" | cut -f 3,6 \
| $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \
| $CMD_LZ4 - > "$INTDIR/LCAs_original.tsv.lz4"
log "Finished the calculation of original LCA's (after substituting AA's by ID's) with status $?."
Expand Down
4 changes: 2 additions & 2 deletions scripts/load.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ function print {

cd "$dir"

# Load every lz4-compressed TSV dump in the current directory into its
# corresponding MariaDB table (table name = file name minus the .tsv.lz4 suffix).
for file in *.tsv.lz4
do
	print "$file"
	# Derive the table name via parameter expansion instead of echo|sed:
	# no subshell, and the suffix is stripped literally rather than as a regex.
	tbl="${file%.tsv.lz4}"
	echo "decompressing - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl"
	# BUG FIX: these dumps are lz4 archives, which zcat cannot decompress;
	# stream them through `lz4 -dc` instead.
	lz4 -dc "$file" | mariadb --local-infile=1 -uunipept -punipept "$db" -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1
done
Expand Down
Loading