From eb633766147b92d638c8f28dc41072037666b5e6 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Thu, 14 Mar 2024 10:33:15 +0100 Subject: [PATCH 1/3] Automatically switch to the correct parser depending on the provided file type of UniProtKB --- scripts/build_database.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/build_database.sh b/scripts/build_database.sh index 3aaa4c6..60e5ab1 100755 --- a/scripts/build_database.sh +++ b/scripts/build_database.sh @@ -99,6 +99,10 @@ terminateAndExit() { # an error has occurred and will properly exit the script. errorAndExit() { echo "Error: the script experienced an error while trying to build the requested database." 1>&2 + if [[ -n "$1" ]] + then + echo "Error details: $1" 1>&2 + fi echo "" 1>&2 clean exit 2 @@ -402,6 +406,17 @@ download_and_convert_all_sources() { mkdir -p "$DB_INDEX_OUTPUT" + # The parser that should be used, depends on the filetype of the database that's been provided to this script. + if [[ $DB_SOURCE == *xml.gz ]] + then + PARSER="xml-parser" + elif [[ $DB_SOURCE == *dat.gz ]] + then + PARSER="dat-parser" + else + errorAndExit "No known parser available for provided UniProtKB file format. Only XML and DAT are available." + fi + # No ETags or other header requests are available if a database is requested from the UniProt REST API. That's why # we always need to reprocess the database in that case. if [[ $DB_SOURCE =~ "rest" ]] @@ -414,7 +429,7 @@ download_and_convert_all_sources() { reportProgress -1 "Downloading database index for $DB_TYPE." 3 - curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" + curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | "$CURRENT_LOCATION/helper_scripts/$PARSER" -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" # Now, compress the different chunks CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk") @@ -458,7 +473,7 @@ download_and_convert_all_sources() { SIZE="$(curl -I "$DB_SOURCE" -s | grep -i content-length | tr -cd '[0-9]')" - curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" + curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | "$CURRENT_LOCATION/helper_scripts/$PARSER" -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" # Now, compress the different chunks CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk") From f2d989c68e6fefd8e2b90c354e71694619af2a92 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Thu, 14 Mar 2024 11:21:16 +0100 Subject: [PATCH 2/3] Update load and index script to use lz4 encoding --- scripts/load.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/load.sh b/scripts/load.sh index fd16525..f8a90c1 100755 --- a/scripts/load.sh +++ b/scripts/load.sh @@ -12,10 +12,10 @@ function print { cd "$dir" -for file in *.tsv.gz +for file in *.tsv.lz4 do print $file - tbl=`echo $file | sed "s/.tsv.gz//"` + tbl=`echo $file | sed "s/.tsv.lz4//"` echo "zcatting - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl" zcat $file | mariadb --local-infile=1 -uunipept -punipept $db -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1 done From 81ebb82dfebc526701d86c1fdc5d1ab7be3e328e Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Thu, 14 Mar 2024 12:02:01 +0100 Subject: [PATCH 3/3] Fix some issues with missing quotes --- scripts/build_database.sh | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/scripts/build_database.sh b/scripts/build_database.sh index 60e5ab1..5e7cffc 100755 --- a/scripts/build_database.sh +++ b/scripts/build_database.sh @@ -33,8 +33,8 @@ Required parameters: * DB_SOURCES: List of UniProt source URLs. The items in this list should be delimited by comma's. Commonly used databases and their corresponding sources are: - - swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - - trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz + - swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz + - trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz * OUTPUT_DIR: Directory in which the tsv.lz4-files that are produced by this script will be stored. @@ -260,7 +260,6 @@ INTDIR="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" # Where should I store intermediate TS KMER_LENGTH=9 # What is the length (k) of the K-mer peptides? CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command should I use? CMD_GZIP="pigz -" # Which pipe compression command should I use for .gz files? -CMD_ZCAT="pigz -dc" # Which decompression command should I use for .gz files? CMD_LZ4="lz4 -c" # Which pipe compression command should I use for .lz4 files? CMD_LZ4CAT="lz4 -dc" # Which decompression command should I use for .lz4 files? ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez? @@ -303,7 +302,7 @@ guz() { fifo="$(uuidgen)-$(basename "$1")" mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" - { $CMD_ZCAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null & + { pigz -dc "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null & } lz() { @@ -429,7 +428,7 @@ download_and_convert_all_sources() { reportProgress -1 "Downloading database index for $DB_TYPE." 3 - curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | "$CURRENT_LOCATION/helper_scripts/$PARSER" -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" + curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" # Now, compress the different chunks CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk") @@ -473,7 +472,7 @@ download_and_convert_all_sources() { SIZE="$(curl -I "$DB_SOURCE" -s | grep -i content-length | tr -cd '[0-9]')" - curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | "$CURRENT_LOCATION/helper_scripts/$PARSER" -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" + curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" # Now, compress the different chunks CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk") @@ -552,11 +551,11 @@ create_most_tables() { log "Started sorting peptides table" - $CMD_LZ4CAT $INTDIR/peptides-out.tsv.lz4 \ + $CMD_LZ4CAT "$INTDIR/peptides-out.tsv.lz4" \ | LC_ALL=C $CMD_SORT -k2 \ - | $CMD_LZ4 > $INTDIR/peptides-equalized.tsv.lz4 + | $CMD_LZ4 > "$INTDIR/peptides-equalized.tsv.lz4" - rm $INTDIR/peptides-out.tsv.lz4 + rm "$INTDIR/peptides-out.tsv.lz4" log "Finished calculation of most tables with status $?" } @@ -572,8 +571,8 @@ number_sequences() { mkfifo "p_eq" mkfifo "p_or" - $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 3 | sort | uniq > "p_or" & - $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 2 | uniq > "p_eq" & + $CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" | cut -f 3 | sort | uniq > "p_or" & + $CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" | cut -f 2 | uniq > "p_eq" & sort -u -m "p_or" "p_eq" | cat -n \ | sed 's/^ *//' | $CMD_LZ4 - > "$INTDIR/sequences.tsv.lz4" @@ -587,7 +586,7 @@ substitute_aas() { have "$INTDIR/peptides-equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4" log "Started the substitution of equalized AA's by ID's for the peptides." - $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 \ + $CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" \ | join -t ' ' -o '1.1,2.1,1.3,1.4,1.5,1.6' -1 2 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \ | $CMD_LZ4 - > "$INTDIR/peptides_by_equalized.tsv.lz4" @@ -606,7 +605,7 @@ substitute_aas() { calculate_equalized_lcas() { have "$INTDIR/peptides_by_equalized.tsv.lz4" || return log "Started the calculation of equalized LCA's." - $CMD_LZ4CAT $INTDIR/peptides_by_equalized.tsv.lz4 | cut -f 2,6 \ + $CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" | cut -f 2,6 \ | $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \ | $CMD_LZ4 - > "$INTDIR/LCAs_equalized.tsv.lz4" log "Finished the calculation of equalized LCA's (after substituting AA's by ID's) with status $?." @@ -616,7 +615,7 @@ calculate_equalized_lcas() { calculate_original_lcas() { have "$INTDIR/peptides_by_original.tsv.lz4" || return log "Started the calculation of original LCA's" - $CMD_LZ4CAT $INTDIR/peptides_by_original.tsv.lz4 | cut -f 3,6 \ + $CMD_LZ4CAT "$INTDIR/peptides_by_original.tsv.lz4" | cut -f 3,6 \ | $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \ | $CMD_LZ4 - > "$INTDIR/LCAs_original.tsv.lz4" log "Finished the calculation of original LCA's (after substituting AA's by ID's) with status $?."