Merge pull request #43 from unipept/feature/stijn-changes
Speed up database construction pipeline
pverscha authored Mar 14, 2024
2 parents 7aa805e + 9ea99e5 commit 9d3c735
Showing 43 changed files with 190 additions and 1,988 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -18,4 +18,3 @@ out
scripts/helper_scripts/parser/output
scripts/helper_scripts/parser/src/META-INF
.idea/

19 changes: 19 additions & 0 deletions scripts/build_binaries.sh
@@ -0,0 +1,19 @@
#! /usr/bin/env bash

# All references to an external script should be relative to the location of this script.
# See: http://mywiki.wooledge.org/BashFAQ/028
CURRENT_LOCATION="${BASH_SOURCE%/*}"

checkdep() {
    which "$1" > /dev/null 2>&1 || hash "$1" > /dev/null 2>&1 || {
        echo "Unipept database builder requires ${2:-$1} to be installed." >&2
        exit 1
    }
}

checkdep cargo "Rust toolchain"

# Build binaries and copy them to the /helper_scripts folder
cd "$CURRENT_LOCATION/helper_scripts/unipept-database-rs" || exit 1
cargo build --release
find ./target/release -maxdepth 1 -type f -executable -exec cp {} .. \;
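
For reference, a typical invocation of this new script (a sketch: it assumes the Rust toolchain is installed and that unipept-database-rs is the Cargo project producing the helper binaries listed in the new scripts/helper_scripts/.gitignore below):

$ ./scripts/build_binaries.sh
# On success, the release binaries (taxa-by-chunk, xml-parser, ...) are
# copied next to the other helpers in scripts/helper_scripts/.

If cargo is missing, checkdep prints "Unipept database builder requires Rust toolchain to be installed." to stderr and the script exits with status 1.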
308 changes: 138 additions & 170 deletions scripts/build_database.sh

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions scripts/helper_scripts/.gitignore
@@ -0,0 +1,9 @@
# Ignore the compiled binaries that get moved here
dat-parser
functional-analysis
lcas
taxa-by-chunk
taxons-lineages
taxons-uniprots-tables
write-to-chunk
xml-parser
73 changes: 0 additions & 73 deletions scripts/helper_scripts/FunctionalAnalysisPeptides.js

This file was deleted.

(Two removed binary files; names not rendered.)
10 changes: 0 additions & 10 deletions scripts/helper_scripts/ParallelXmlToTab.js

This file was deleted.

50 changes: 0 additions & 50 deletions scripts/helper_scripts/TaxaByChunk.js

This file was deleted.

Binary file removed scripts/helper_scripts/TaxonsUniprots2Tables.jar
Binary file not shown.
49 changes: 0 additions & 49 deletions scripts/helper_scripts/WriteToChunk.js

This file was deleted.

Binary file removed scripts/helper_scripts/XmlToTabConverter.jar
Binary file not shown.
8 changes: 4 additions & 4 deletions scripts/helper_scripts/filter_taxa.sh
@@ -14,7 +14,7 @@ mkdir -p "$TMP_DIR"

filter_taxa() {
QUERY=$(echo "\s$1\s" | sed "s/,/\\\s\\\|\\\s/g")
-RESULT=$(cat "$LINEAGE_ARCHIVE" | zcat | grep "$QUERY" | cut -f1 | sort -n | uniq | tr '\n' ',')
+RESULT=$(lz4 -dc "$LINEAGE_ARCHIVE" | grep "$QUERY" | cut -f1 | sort -n | uniq | tr '\n' ',')
echo "$RESULT"
}

@@ -23,16 +23,16 @@ then
TAXA=$(filter_taxa "$TAXA")

# This associative array maps a filename upon the taxa that should be queried within this file
-QUERIES=( $(echo "$TAXA" | tr "," "\n" | node "$CURRENT_LOCATION/TaxaByChunk.js" "$DATABASE_INDEX" "$TMP_DIR") )
+QUERIES=( $(echo "$TAXA" | tr "," "\n" | $CURRENT_LOCATION/taxa-by-chunk --chunk-dir "$DATABASE_INDEX" --temp-dir "$TMP_DIR") )

if [[ ${#QUERIES[@]} -gt 0 ]]
then
-parallel --jobs 8 --max-args 2 "cat {2} | zcat | sed 's/$/$/' | grep -F -f {1} | sed 's/\$$//'" ::: "${QUERIES[@]}"
+parallel --jobs 8 --max-args 2 "lz4 -dc {2} | sed 's/$/$/' | grep -F -f {1} | sed 's/\$$//'" ::: "${QUERIES[@]}"
fi
else

# If the root ID has been passed to this script, we simply print out all database items (without filtering).
find "$DATABASE_INDEX" -name "*.chunk.gz" | xargs zcat
find "$DATABASE_INDEX" -name "*.chunk.lz4" -exec lz4 -mdc {} +
fi

# Remove temporary files
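To make the sed invocation in filter_taxa concrete, here is a worked example with a hypothetical taxa list "562,623" (IDs made up for illustration). Each comma becomes \s\|\s, so QUERY ends up as a basic-regex alternation that matches either ID as a whitespace-delimited field of the decompressed lineage table:

$ echo "\s562,623\s" | sed "s/,/\\\s\\\|\\\s/g"
\s562\s\|\s623\s

grep "$QUERY" then keeps every lineage row mentioning one of the requested taxa, and cut -f1 | sort -n | uniq | tr '\n' ',' reduces the matches to a comma-separated list of taxon IDs.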
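
Two smaller tricks in the parallel branch deserve a note. sed 's/$/$/' appends a literal $ sentinel to every line before the fixed-string grep -F -f {1}, presumably so that patterns ending in $ can still anchor to the end of a line even though -F treats $ as a plain character; the final sed 's/\$$//' strips the sentinel again. A minimal sketch (the tab-separated chunk line is hypothetical):

$ printf 'PEPTIDE\t9606\n' | sed 's/$/$/'
PEPTIDE	9606$

On the decompression side, the chunks now carry a .chunk.lz4 suffix, and lz4 -mdc (-m accept multiple input files, -d decompress, -c write to stdout) replaces the gzip-era find … | xargs zcat; lz4 decompression is typically several times faster than gzip, in line with this PR's goal of speeding up the pipeline.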
107 changes: 0 additions & 107 deletions scripts/helper_scripts/parser/pom.xml

This file was deleted.

(Four additional deleted files; names not rendered.)