From a4177ba3b1b4f94737e7e4fa4c8bebbca70a4b38 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt
Date: Thu, 14 Mar 2024 09:43:11 +0100
Subject: [PATCH] Update static database action according to migration to Rust and LZ4

---
Reviewer notes (text between the "---" marker and the first "diff" line is
not part of the commit message and is skipped when the patch is applied):

* The `if:` guard on the `cargo fetch` step compares the cache step output
  against the string 'true'. Step outputs are strings, and any non-empty
  string (including 'false') is truthy in workflow expressions, so a bare
  `!steps.cache.outputs.cache-hit` can never be true if the cache step
  reports the string 'false' on a miss.
* The workflow now decompresses `output/*.tsv.lz4` with `lz4cat` instead of
  `output/*.tsv.gz` with `zcat`, and installs `lz4` (plus `uuid-runtime`,
  `pigz` and `parallel`) instead of `maven`.
* `set +eo pipefail` / `set -eo pipefail` bracket only the curl | egrep
  pipeline, so an empty egrep result does not abort the script.
* NOTE(review): leading whitespace inside the hunks was reconstructed from
  the hunk line counts and the usual workflow/script layout. Verify it
  against the target files, or apply with `git apply --ignore-whitespace`.
* NOTE(review): the `-separator ' '` argument is reproduced as seen;
  presumably the original character is a literal tab for .tsv input. Confirm.
* NOTE(review): `shared-key` reads `env.CACHE_KEY`, which is not defined in
  the visible hunks. Confirm it is set elsewhere in the workflow.

 .github/workflows/static_database.yml | 25 ++++++++++++++-----------
 scripts/build_database.sh             |  3 +++
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml
index 7d76be0..5218896 100644
--- a/.github/workflows/static_database.yml
+++ b/.github/workflows/static_database.yml
@@ -10,19 +10,22 @@ jobs:
   generate_static_database:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-java@v1
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@stable
+      - uses: Swatinem/rust-cache@v2
+        id: cache
         with:
-          java-version: '11'
-          java-package: jdk
-          architecture: x64
+          shared-key: ${{ env.CACHE_KEY }}
+      - run: cd scripts/helper_scripts/unipept-database-rs && cargo fetch
+        if: ${{ steps.cache.outputs.cache-hit != 'true' }}
+      - run: ./scripts/build_binaries.sh
       - name: Get current date
         id: date
         run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
       - name: Install required utilities
         run: |
           sudo apt-get update
-          sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget
+          sudo apt-get -y install git curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget uuid-runtime pigz lz4 parallel
       - name: Download Taxdmp file
         shell: bash
         run: wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip
@@ -36,11 +39,11 @@ jobs:
           sqlite3 output.db < workflows/static_database/structure.sql

           # Read all generated data into this database
-          zcat output/ec_numbers.tsv.gz | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin ec_numbers'
-          zcat output/go_terms.tsv.gz | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin go_terms'
-          zcat output/interpro_entries.tsv.gz | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin interpro_entries'
-          zcat output/taxons.tsv.gz | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin taxons'
-          zcat output/lineages.tsv.gz | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin lineages'
+          lz4cat output/ec_numbers.tsv.lz4 | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin ec_numbers'
+          lz4cat output/go_terms.tsv.lz4 | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin go_terms'
+          lz4cat output/interpro_entries.tsv.lz4 | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin interpro_entries'
+          lz4cat output/taxons.tsv.lz4 | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin taxons'
+          lz4cat output/lineages.tsv.lz4 | sqlite3 -csv -separator ' ' output.db '.import /dev/stdin lineages'

           # Create virtual tables
           sqlite3 output.db < workflows/static_database/init_virtual_tables.sql
diff --git a/scripts/build_database.sh b/scripts/build_database.sh
index 08f4015..3cf903e 100755
--- a/scripts/build_database.sh
+++ b/scripts/build_database.sh
@@ -332,7 +332,10 @@ download_taxdmp() {
     # Check if our self-hosted version is available or not using the GitHub API
     LATEST_RELEASE_URL="https://api.github.com/repos/unipept/unipept-database/releases/latest"
     TAXDMP_RELEASE_ASSET_RE="unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp.zip"
+    # Temporarily disable errexit and pipefail (egrep exits with code 1 when nothing matches).
+    set +eo pipefail
     SELF_HOSTED_URL=$(curl -s "$LATEST_RELEASE_URL" | egrep -o "$TAXDMP_RELEASE_ASSET_RE")
+    set -eo pipefail

     if [ "$SELF_HOSTED_URL" ]
     then