diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml
new file mode 100644
index 0000000..d85a088
--- /dev/null
+++ b/.github/workflows/static_database.yml
@@ -0,0 +1,112 @@
+# Builds the static Unipept information database (taxons and functional
+# annotations) on the first day of every month and publishes it as a
+# GitHub release.
+name: Database
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    - cron: '0 0 1 * *'
+
+jobs:
+  generate_static_database:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-java@v1
+        with:
+          java-version: '11'
+          java-package: jdk
+          architecture: x64
+      - name: Get current date
+        id: date
+        # The date is reused below as database version, tag and release name
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+      - name: Install required utilities
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install git make maven wget unzip expect gawk sqlite3 libsqlite3-dev
+      - name: Where is expect?
+        run: |
+          which expect
+      - name: Configure makefile
+        run: |
+          # The generation of umgap-data is not required for the static database
+          sed -i '/checkdep umgap/d' configure
+          sed -i '/all: makefile database index/d' makefile.in
+          chmod +x workflows/static_database/script.exp
+          chmod +x configure
+          # script.exp spawns ./configure and answers its prompts
+          ./workflows/static_database/script.exp
+      - name: Generate tsv.gz files
+        shell: bash
+        run: |
+          make taxons
+          make functional_annotations
+      - name: Build SQLite database from generated files
+        shell: bash
+        run: |
+          chmod +x workflows/static_database/build_database.sh
+          ./workflows/static_database/build_database.sh
+      - name: ls
+        run: ls -hl
+      - name: Update database versioning
+        shell: bash
+        run: |
+          # '>' creates or truncates the file, so no separate 'rm' is needed;
+          # a plain 'rm' would fail the step whenever the file does not exist.
+          echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt
+      - name: Update resources
+        uses: alexesprit/action-update-file@master
+        with:
+          file-path: workflows/static_database/version.txt
+          commit-msg: Bump db version to ${{ steps.date.outputs.date }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Get newly made commit sha
+        id: commit_sha
+        shell: bash
+        run: |
+          # No step above runs 'git commit' locally, so the local HEAD is still
+          # the commit that was checked out. Read the branch head from the
+          # remote instead (assumes the previous step pushed to this same branch).
+          git fetch --depth=1 origin "${GITHUB_REF}"
+          echo "sha=$(git rev-parse FETCH_HEAD)" >> "$GITHUB_OUTPUT"
+      - name: Create new tag
+        # NOTE(review): per the GitHub REST docs this call only creates an
+        # annotated tag object; it does not create a refs/tags/... reference.
+        uses: octokit/request-action@v2.x
+        id: create_new_tag
+        with:
+          route: POST /repos/:owner/:repo/git/tags
+          owner: unipept
+          repo: make-database
+          tag: database-${{ steps.date.outputs.date }}
+          message: "Static information database built on ${{ steps.date.outputs.date }}"
+          object: ${{ steps.commit_sha.outputs.sha }}
+          type: commit
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Create Release
+        id: create_release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: database-${{ steps.date.outputs.date }}
+          release_name: Static database ${{ steps.date.outputs.date }}
+          # Point the release tag at the version bump commit instead of the
+          # commit the workflow run started from (the action's default).
+          commitish: ${{ steps.commit_sha.outputs.sha }}
+          draft: false
+          prerelease: false
+      - name: Upload Release Asset
+        id: upload-release-asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          # output.zip is written by build_database.sh
+          asset_path: ./output.zip
+          asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip
+          asset_content_type: application/zip
diff --git a/makefile.in b/makefile.in
index d490d8c..d66e63e 100644
--- a/makefile.in
+++ b/makefile.in
@@ -32,6 +32,12 @@ taxons: \
 	<<>>/taxons.tsv.gz \
 	<<>>/lineages.tsv.gz
 
+.PHONY: functional_annotations
+functional_annotations: \
+	<<>>/interpro_entries.tsv.gz \
+	<<>>/go_terms.tsv.gz \
+	<<>>/ec_numbers.tsv.gz
+
 .PHONY: download
 download: <<>>
 # }}}
@@ -70,7 +76,7 @@ $(JAR): $(SRC)
 	<<>> "Finished unzipping names or nodes from the taxon dump."
 
 <<>>/clean-nodes.dmp: <<>>/nodes.dmp
-	<<>> "Starting cleaning unknown ranks form nodes."
+	<<>> "Starting cleaning unknown ranks from nodes."
 	@mkdir -p $(dir $@)
 	<<>> < "$<" \
 		-e 's/subcohort/no rank/' \
diff --git a/workflows/static_database/build_database.sh b/workflows/static_database/build_database.sh
new file mode 100644
index 0000000..a11d71c
--- /dev/null
+++ b/workflows/static_database/build_database.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+#
+# Builds output.db from the generated tsv.gz tables and packs it as output.zip.
+# Must be run from the repository root: all paths below are relative to it.
+
+# Abort on the first failing command (including inside pipelines) so that a
+# broken decompression or import never ends up in a published release.
+set -euo pipefail
+
+# Initialize the database
+sqlite3 output.db < workflows/static_database/structure.sql
+
+# Read all generated data into this database. Tabs are rewritten to '$'
+# because the sqlite3 CSV importer is given '$' as its column separator.
+for table in ec_numbers go_terms interpro_entries taxons; do
+    zcat "data/tables/${table}.tsv.gz" \
+        | sed 's/\t/$/g' \
+        | sqlite3 -csv -separator '$' output.db ".import /dev/stdin ${table}"
+done
+
+# Compress the database before uploading it to a Github release
+zip output.zip output.db
diff --git a/workflows/static_database/script.exp b/workflows/static_database/script.exp
new file mode 100644
index 0000000..9b40e00
--- /dev/null
+++ b/workflows/static_database/script.exp
@@ -0,0 +1,115 @@
+#!/usr/bin/expect -f
+#
+# This Expect script was generated by autoexpect on Tue May 12 17:00:43 2020
+# Expect and autoexpect were both written by Don Libes, NIST.
+#
+# Note that autoexpect does not guarantee a working script. It
+# necessarily has to guess about certain things. Two reasons a script
+# might fail are:
+#
+# 1) timing - A surprising number of programs (rn, ksh, zsh, telnet,
+# etc.) and devices discard or ignore keystrokes that arrive "too
+# quickly" after prompts. If you find your new script hanging up at
+# one spot, try adding a short sleep just before the previous send.
+# Setting "force_conservative" to 1 (see below) makes Expect do this
+# automatically - pausing briefly before sending each character. This
+# pacifies every program I know of. The -c flag makes the script do
+# this in the first place. The -C flag allows you to define a
+# character to toggle this mode off and on.
+
+set force_conservative 0 ;# set to 1 to force conservative mode even if
+                         ;# script wasn't run conservatively originally
+if {$force_conservative} {
+	set send_slow {1 .1}
+	proc send {ignore arg} {
+		sleep .1
+		exp_send -s -- $arg
+	}
+}
+
+#
+# 2) differing output - Some programs produce different output each time
+# they run. The "date" command is an obvious example. Another is
+# ftp, if it produces throughput statistics at the end of a file
+# transfer. If this causes a problem, delete these patterns or replace
+# them with wildcards. An alternative is to use the -p flag (for
+# "prompt") which makes Expect only look for the last line of output
+# (i.e., the prompt). The -P flag allows you to define a character to
+# toggle this mode off and on.
+#
+# Read the man page for more info.
+#
+# -Don
+
+
+# Answers every ./configure prompt with its default value, except:
+#   - Java memory: 1g
+#   - gnu awk executable: gawk
+#   - parsing of swissprot and trembl: n
+# Prompts are matched with -exact, so they have to mirror ./configure verbatim.
+
+# A timeout of -1 makes expect wait for each prompt indefinitely
+set timeout -1
+spawn ./configure
+match_max 100000
+expect -exact "Configuring the Unipept backend program.\r
+What is the minimum length (inclusive) for tryptic peptides? \[5\] "
+send -- "\r"
+expect -exact "\r
+What is the maximum length (inclusive) for tryptic peptides? \[50\] "
+send -- "\r"
+expect -exact "\r
+What is the length (k) of the K-mer peptides? \[9\] "
+send -- "\r"
+expect -exact "\r
+Where should I store the final TSV files (large, single-write)? \[./data/tables\] "
+send -- "\r"
+expect -exact "\r
+Where should I store intermediate TSV files (large, single-write, multiple-read? \[./data/intermediate\] "
+send -- "\r"
+expect -exact "\r
+Where should I store and extract the downloaded taxon zip (small, single-write, single-read)? \[./data/taxon\] "
+send -- "\r"
+expect -exact "\r
+Where should I store the downloaded source xml files (large, single-write, single-read)? \[./data/sources\] "
+send -- "\r"
+expect -exact "\r
+How much memory should Java use? \[6g\] "
+send -- "1g\r"
+expect -exact "1g\r
+Which batch size should I use for communication with Entrez? \[1000\] "
+send -- "\r"
+expect -exact "\r
+Which sort command should I use? \[sort --buffer-size=80% --parallel=4\] "
+send -- "\r"
+expect -exact "\r
+Which pipe compression command should I use? \[gzip -\] "
+send -- "\r"
+expect -exact "\r
+Which pipe decompression command (e.g. zcat, gzcat) should I use? \[zcat\] "
+send -- "\r"
+expect -exact "\r
+How do I unzip while discarding dates? \[unzip -DD\] "
+send -- "\r"
+expect -exact "\r
+What's my sed executable (e.g. sed, gsed)? \[sed\] "
+send -- "\r"
+expect -exact "\r
+What's my gnu awk executable (e.g. awk, gawk)? \[awk\] "
+send -- "gawk\r"
+expect -exact "gawk\r
+What's my gnu mktemp executable (e.g. mktemp, gmktemp)? \[mktemp\] "
+send -- "\r"
+expect -exact "\r
+What's my gnu join executable (e.g. join, gjoin)? \[join\] "
+send -- "\r"
+expect -exact "\r
+Parse swissprot (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz)? \[Y/n\] "
+send -- "n\r"
+expect -exact "n\r
+Parse trembl (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz)? \[Y/n\] "
+send -- "n\r"
+expect -exact "n\r
+Add another source by entering the name. An empty name cancels: "
+send -- "\r"
+expect eof
diff --git a/workflows/static_database/structure.sql b/workflows/static_database/structure.sql
new file mode 100644
index 0000000..2729edb
--- /dev/null
+++ b/workflows/static_database/structure.sql
@@ -0,0 +1,41 @@
+-- Schema of the static SQLite database (output.db).
+-- build_database.sh pipes this file into sqlite3 and then fills each table
+-- with '.import', so every table has to exist before the import starts.
+
+CREATE TABLE `go_terms` (
+  `id` INT NOT NULL,
+  `code` TEXT NOT NULL,
+  `namespace` TEXT NOT NULL,
+  `name` TEXT NOT NULL,
+  PRIMARY KEY (`id`)
+);
+
+CREATE UNIQUE INDEX idx_go_code ON go_terms(code);
+
+CREATE TABLE `ec_numbers` (
+  `id` INT NOT NULL,
+  `code` TEXT NOT NULL,
+  `name` TEXT NOT NULL,
+  PRIMARY KEY (`id`)
+);
+
+CREATE UNIQUE INDEX idx_ec_code ON ec_numbers(code);
+
+CREATE TABLE `interpro_entries` (
+  `id` INT NOT NULL,
+  `code` TEXT NOT NULL,
+  `category` TEXT NOT NULL,
+  `name` TEXT NOT NULL,
+  PRIMARY KEY (`id`)
+);
+
+CREATE UNIQUE INDEX idx_ipr_code ON interpro_entries(code);
+
+CREATE TABLE IF NOT EXISTS `taxons` (
+  `id` INT UNSIGNED NOT NULL,
+  `name` TEXT NOT NULL,
+  `rank` TEXT NULL DEFAULT NULL,
+  `parent_id` INT NULL DEFAULT NULL,
+  `valid_taxon` INT NOT NULL DEFAULT 1,
+  PRIMARY KEY (`id`)
+);