Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically switch between XML and DAT parser for UniProtKB #45

Merged
merged 3 commits on
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions scripts/build_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ Required parameters:

* DB_SOURCES: List of UniProt source URLs. The items in this list should be delimited by commas. Commonly used
databases and their corresponding sources are:
- swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
- trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz
- swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
- trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.dat.gz

* OUTPUT_DIR: Directory in which the tsv.lz4-files that are produced by this script will be stored.

Expand Down Expand Up @@ -99,6 +99,10 @@ terminateAndExit() {
# an error has occurred and will properly exit the script.
errorAndExit() {
echo "Error: the script experienced an error while trying to build the requested database." 1>&2
if [[ -n "$1" ]]
then
echo "Error details: $1" 1>&2
fi
echo "" 1>&2
clean
exit 2
Expand Down Expand Up @@ -256,7 +260,6 @@ INTDIR="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" # Where should I store intermediate TS
KMER_LENGTH=9 # What is the length (k) of the K-mer peptides?
CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command should I use?
CMD_GZIP="pigz -" # Which pipe compression command should I use for .gz files?
CMD_ZCAT="pigz -dc" # Which decompression command should I use for .gz files?
CMD_LZ4="lz4 -c" # Which pipe compression command should I use for .lz4 files?
CMD_LZ4CAT="lz4 -dc" # Which decompression command should I use for .lz4 files?
ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?
Expand Down Expand Up @@ -299,7 +302,7 @@ guz() {
fifo="$(uuidgen)-$(basename "$1")"
mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
{ $CMD_ZCAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
{ pigz -dc "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
}

lz() {
Expand Down Expand Up @@ -402,6 +405,17 @@ download_and_convert_all_sources() {

mkdir -p "$DB_INDEX_OUTPUT"

# The parser that should be used, depends on the filetype of the database that's been provided to this script.
# Dispatch on the file extension of the source URL: *.xml.gz -> XML parser, *.dat.gz -> DAT parser,
# anything else is an unsupported format and aborts the build.
case "$DB_SOURCE" in
	*xml.gz)
		PARSER="xml-parser"
		;;
	*dat.gz)
		PARSER="dat-parser"
		;;
	*)
		errorAndExit "No known parser available for provided UniProtKB file format. Only XML and DAT are available."
		;;
esac

# No ETags or other header requests are available if a database is requested from the UniProt REST API. That's why
# we always need to reprocess the database in that case.
if [[ $DB_SOURCE =~ "rest" ]]
Expand All @@ -414,7 +428,7 @@ download_and_convert_all_sources() {

reportProgress -1 "Downloading database index for $DB_TYPE." 3

curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"

# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
Expand Down Expand Up @@ -458,7 +472,7 @@ download_and_convert_all_sources() {

SIZE="$(curl -I "$DB_SOURCE" -s | grep -i content-length | tr -cd '[0-9]')"

curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | pigz -dc | $CURRENT_LOCATION/helper_scripts/$PARSER -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"

# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
Expand Down Expand Up @@ -537,11 +551,11 @@ create_most_tables() {

log "Started sorting peptides table"

$CMD_LZ4CAT $INTDIR/peptides-out.tsv.lz4 \
$CMD_LZ4CAT "$INTDIR/peptides-out.tsv.lz4" \
| LC_ALL=C $CMD_SORT -k2 \
| $CMD_LZ4 > $INTDIR/peptides-equalized.tsv.lz4
| $CMD_LZ4 > "$INTDIR/peptides-equalized.tsv.lz4"

rm $INTDIR/peptides-out.tsv.lz4
rm "$INTDIR/peptides-out.tsv.lz4"
log "Finished calculation of most tables with status $?"
}

Expand All @@ -557,8 +571,8 @@ number_sequences() {
mkfifo "p_eq"
mkfifo "p_or"

$CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 3 | sort | uniq > "p_or" &
$CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 2 | uniq > "p_eq" &
$CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" | cut -f 3 | sort | uniq > "p_or" &
$CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" | cut -f 2 | uniq > "p_eq" &

sort -u -m "p_or" "p_eq" | cat -n \
| sed 's/^ *//' | $CMD_LZ4 - > "$INTDIR/sequences.tsv.lz4"
Expand All @@ -572,7 +586,7 @@ substitute_aas() {
have "$INTDIR/peptides-equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4"

log "Started the substitution of equalized AA's by ID's for the peptides."
$CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 \
$CMD_LZ4CAT "$INTDIR/peptides-equalized.tsv.lz4" \
| join -t ' ' -o '1.1,2.1,1.3,1.4,1.5,1.6' -1 2 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \
| $CMD_LZ4 - > "$INTDIR/peptides_by_equalized.tsv.lz4"

Expand All @@ -591,7 +605,7 @@ substitute_aas() {
calculate_equalized_lcas() {
have "$INTDIR/peptides_by_equalized.tsv.lz4" || return
log "Started the calculation of equalized LCA's."
$CMD_LZ4CAT $INTDIR/peptides_by_equalized.tsv.lz4 | cut -f 2,6 \
$CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" | cut -f 2,6 \
| $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \
| $CMD_LZ4 - > "$INTDIR/LCAs_equalized.tsv.lz4"
log "Finished the calculation of equalized LCA's (after substituting AA's by ID's) with status $?."
Expand All @@ -601,7 +615,7 @@ calculate_equalized_lcas() {
calculate_original_lcas() {
have "$INTDIR/peptides_by_original.tsv.lz4" || return
log "Started the calculation of original LCA's"
$CMD_LZ4CAT $INTDIR/peptides_by_original.tsv.lz4 | cut -f 3,6 \
$CMD_LZ4CAT "$INTDIR/peptides_by_original.tsv.lz4" | cut -f 3,6 \
| $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \
| $CMD_LZ4 - > "$INTDIR/LCAs_original.tsv.lz4"
log "Finished the calculation of original LCA's (after substituting AA's by ID's) with status $?."
Expand Down
4 changes: 2 additions & 2 deletions scripts/load.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ function print {

cd "$dir"

# Load every lz4-compressed TSV dump in the current directory into its
# corresponding MariaDB table (table name = file name minus the .tsv.lz4 suffix).
for file in *.tsv.lz4
do
	print "$file"
	# Derive the table name via parameter expansion instead of echo|sed:
	# no subshell, and the suffix is stripped literally rather than as a regex.
	tbl="${file%.tsv.lz4}"
	echo "decompressing - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl"
	# BUG FIX: these dumps are lz4 archives, which zcat cannot decompress;
	# stream them through `lz4 -dc` instead.
	lz4 -dc "$file" | mariadb --local-infile=1 -uunipept -punipept "$db" -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1
done
Expand Down
Loading