Merge pull request #10 from unipept/feature/static-information-database

Implement Github action for building static information database
unipept · May 12, 2020 · 4394a1f · 4394a1f
2 parents d6a013d + 7872f67
commit 4394a1f
Show file tree

Hide file tree

Showing 5 changed files with 259 additions and 1 deletion.
diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml
@@ -0,0 +1,96 @@
+# Scheduled workflow that rebuilds the static information database and
+# publishes it as a GitHub release asset.
+name: Database
+
+on:
+  schedule:
+    # Minute 0, hour 0, day-of-month 1: runs once a month.
+    # * is a special character in YAML, so the cron string has to be quoted.
+    - cron: '0 0 1 * *'
+
+jobs:
+  generate_static_database:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-java@v1
+        with:
+          java-version: '11'
+          java-package: jdk
+          architecture: x64
+      # Exposes today's date (YYYY-MM-DD) as `steps.date.outputs.date`; later
+      # steps use it for the version file, the tag and the release names.
+      # The value is appended to $GITHUB_OUTPUT because the `::set-output`
+      # workflow command is deprecated.
+      - name: Get current date
+        id: date
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+      # Installs the command-line tools used by the configure script, the
+      # makefile and the database build script in the steps below.
+      - name: Install required utilities
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install git make maven wget unzip expect gawk sqlite3 libsqlite3-dev
+      # Debug aid: prints the path of the installed `expect` binary.
+      - name: Where is expect?
+        run: |
+          which expect
+      # Removes the umgap dependency check from `configure` and the default
+      # `all` target line from `makefile.in`, then runs the Expect script,
+      # which spawns `./configure` and answers its prompts non-interactively.
+      - name: Configure makefile
+        run: |
+          # The generation of umgap-data is not required for the static database
+          sed -i '/checkdep umgap/d' configure
+          sed -i '/all: makefile database index/d' makefile.in
+          chmod +x workflows/static_database/script.exp
+          chmod +x configure
+          ./workflows/static_database/script.exp
+      # Builds only the two make targets whose tables the static database needs.
+      - name: Generate tsv.gz files
+        shell: bash
+        run: |
+          make taxons
+          make functional_annotations
+      # Loads the generated tables into SQLite; the script writes output.db
+      # and output.zip into the working directory.
+      - name: Build SQLite database from generated files
+        shell: bash
+        run: |
+          chmod +x workflows/static_database/build_database.sh
+          ./workflows/static_database/build_database.sh
+      # Debug aid: lists the working directory with human-readable sizes.
+      - name: ls
+        run: ls -hl
+      # Writes the build date into the version file. The redirect creates or
+      # truncates the file, so no prior `rm` is needed (a plain `rm` fails the
+      # step when the file does not exist yet).
+      - name: Update database versioning
+        shell: bash
+        run: |
+          echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt
+      # Commits the updated version file back to the repository.
+      # NOTE(review): `@master` is a moving ref that receives GITHUB_TOKEN;
+      # consider pinning to a release tag or commit SHA.
+      - name: Update resources
+        uses: alexesprit/action-update-file@master
+        with:
+          file-path: workflows/static_database/version.txt
+          commit-msg: Bump db version to ${{ steps.date.outputs.date }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+      # Exposes the checkout's HEAD commit as `steps.commit_sha.outputs.sha`.
+      # The value is appended to $GITHUB_OUTPUT because the `::set-output`
+      # workflow command is deprecated.
+      # NOTE(review): this reads the *local* HEAD. If the previous step creates
+      # its commit through the GitHub API instead of the local clone, this is
+      # still the pre-bump commit - confirm, and prefer that action's own
+      # commit output if it provides one.
+      - name: Get newly made commit sha
+        id: commit_sha
+        shell: bash
+        run: |
+          echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
+      # Creates a tag object named database-<date> for the commit recorded by
+      # the `commit_sha` step.
+      # NOTE(review): the action reference was garbled to "[email protected]" in
+      # this listing; `request-action@v2.x` is the presumed original - confirm.
+      # NOTE(review): per the GitHub REST documentation, creating a tag object
+      # does not create the refs/tags/... reference; confirm the release below
+      # ends up on the intended commit.
+      - name: Create new tag
+        uses: octokit/request-action@v2.x
+        id: create_new_tag
+        with:
+          route: POST /repos/:owner/:repo/git/tags
+          owner: unipept
+          repo: make-database
+          tag: database-${{ steps.date.outputs.date }}
+          message: "Static information database built on ${{ steps.date.outputs.date }}"
+          object: ${{ steps.commit_sha.outputs.sha }}
+          type: commit
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      # Publishes a regular (non-draft, non-prerelease) release for that tag.
+      - name: Create Release
+        id: create_release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: database-${{ steps.date.outputs.date }}
+          release_name: Static database ${{ steps.date.outputs.date }}
+          draft: false
+          prerelease: false
+      # Attaches the zipped SQLite database to the release created above.
+      - name: Upload Release Asset
+        id: upload-release-asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: ./output.zip
+          asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip
+          asset_content_type: application/zip
diff --git a/makefile.in b/makefile.in
@@ -32,6 +32,12 @@ taxons: \
 	<<<TABDIR>>>/taxons.tsv.gz \
 	<<<TABDIR>>>/lineages.tsv.gz
 
+# Convenience target that builds only the functional annotation tables
+# (interpro_entries, go_terms and ec_numbers).
+.PHONY: functional_annotations
+functional_annotations: \
+	<<<TABDIR>>>/interpro_entries.tsv.gz \
+	<<<TABDIR>>>/go_terms.tsv.gz \
+	<<<TABDIR>>>/ec_numbers.tsv.gz
+
 .PHONY: download
 download: <<<SOURCE_FILES>>>
 # }}}
@@ -70,7 +76,7 @@ $(JAR): $(SRC)
 	<<<LOGADD>>> "Finished unzipping names or nodes from the taxon dump."
 
 <<<INTDIR>>>/clean-nodes.dmp: <<<INTDIR>>>/nodes.dmp
-	<<<LOGADD>>> "Starting cleaning unknown ranks form nodes."
+	<<<LOGADD>>> "Starting cleaning unknown ranks from nodes."
 	@mkdir -p $(dir $@)
 	<<<CMD_SED>>> < "$<" \
 		-e 's/subcohort/no rank/' \

diff --git a/workflows/static_database/build_database.sh b/workflows/static_database/build_database.sh
@@ -0,0 +1,11 @@
+# Initialize the database
+cat workflows/static_database/structure.sql | sqlite3 output.db
+
+# Read all generated data into this database
+zcat data/tables/ec_numbers.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin ec_numbers'
+zcat data/tables/go_terms.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin go_terms'
+zcat data/tables/interpro_entries.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin interpro_entries'
+zcat data/tables/taxons.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin taxons'
+
+# Compress the database before uploading it to a Github release
+zip output.zip output.db
diff --git a/workflows/static_database/script.exp b/workflows/static_database/script.exp
@@ -0,0 +1,108 @@
+#!/usr/bin/expect -f
+#
+# This Expect script was generated by autoexpect on Tue May 12 17:00:43 2020
+# Expect and autoexpect were both written by Don Libes, NIST.
+#
+# Note that autoexpect does not guarantee a working script.  It
+# necessarily has to guess about certain things.  Two reasons a script
+# might fail are:
+#
+# 1) timing - A surprising number of programs (rn, ksh, zsh, telnet,
+# etc.) and devices discard or ignore keystrokes that arrive "too
+# quickly" after prompts.  If you find your new script hanging up at
+# one spot, try adding a short sleep just before the previous send.
+# Setting "force_conservative" to 1 (see below) makes Expect do this
+# automatically - pausing briefly before sending each character.  This
+# pacifies every program I know of.  The -c flag makes the script do
+# this in the first place.  The -C flag allows you to define a
+# character to toggle this mode off and on.
+
+# Set to 1 to force conservative mode even if the script wasn't run
+# conservatively originally.
+set force_conservative 0
+
+if {$force_conservative} {
+	set send_slow {1 .1}
+	# Paced replacement for send: pauses briefly, then forwards the text to
+	# exp_send in slow mode. The first argument (the "--" callers pass) is
+	# discarded.
+	proc send {_flag text} {
+		sleep .1
+		exp_send -s -- $text
+	}
+}
+
+#
+# 2) differing output - Some programs produce different output each time
+# they run.  The "date" command is an obvious example.  Another is
+# ftp, if it produces throughput statistics at the end of a file
+# transfer.  If this causes a problem, delete these patterns or replace
+# them with wildcards.  An alternative is to use the -p flag (for
+# "prompt") which makes Expect only look for the last line of output
+# (i.e., the prompt).  The -P flag allows you to define a character to
+# toggle this mode off and on.
+#
+# Read the man page for more info.
+#
+# -Don
+
+
+# Drives ./configure non-interactively: each prompt is matched literally and
+# answered with its default (a bare carriage return) unless noted otherwise.
+# Finally the script exits with configure's own exit status.
+
+# Never time out: every expect below waits indefinitely for its prompt.
+set timeout -1
+spawn ./configure
+match_max 100000
+expect -exact "Configuring the Unipept backend program.\r
+What is the minimum length (inclusive) for tryptic peptides? \[5\] "
+send -- "\r"
+expect -exact "\r
+What is the maximum length (inclusive) for tryptic peptides? \[50\] "
+send -- "\r"
+expect -exact "\r
+What is the length (k) of the K-mer peptides? \[9\] "
+send -- "\r"
+expect -exact "\r
+Where should I store the final TSV files (large, single-write)? \[./data/tables\] "
+send -- "\r"
+expect -exact "\r
+Where should I store intermediate TSV files (large, single-write, multiple-read? \[./data/intermediate\] "
+send -- "\r"
+expect -exact "\r
+Where should I store and extract the downloaded taxon zip (small, single-write, single-read)? \[./data/taxon\] "
+send -- "\r"
+expect -exact "\r
+Where should I store the downloaded source xml files (large, single-write, single-read)? \[./data/sources\] "
+send -- "\r"
+expect -exact "\r
+How much memory should Java use? \[6g\] "
+# Non-default answer: 1g instead of the suggested 6g.
+send -- "1g\r"
+expect -exact "1g\r
+Which batch size should I use for communication with Entrez? \[1000\] "
+send -- "\r"
+expect -exact "\r
+Which sort command should I use? \[sort --buffer-size=80% --parallel=4\] "
+send -- "\r"
+expect -exact "\r
+Which pipe compression command should I use? \[gzip -\] "
+send -- "\r"
+expect -exact "\r
+Which pipe decompression command (e.g. zcat, gzcat) should I use? \[zcat\] "
+send -- "\r"
+expect -exact "\r
+How do I unzip while discarding dates? \[unzip -DD\] "
+send -- "\r"
+expect -exact "\r
+What's my sed executable (e.g. sed, gsed)? \[sed\] "
+send -- "\r"
+expect -exact "\r
+What's my gnu awk executable (e.g. awk, gawk)? \[awk\] "
+# Non-default answer: use gawk explicitly instead of awk.
+send -- "gawk\r"
+expect -exact "gawk\r
+What's my gnu mktemp executable (e.g. mktemp, gmktemp)? \[mktemp\] "
+send -- "\r"
+expect -exact "\r
+What's my gnu join executable (e.g. join, gjoin)? \[join\] "
+send -- "\r"
+expect -exact "\r
+Parse swissprot (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz)? \[Y/n\] "
+# Non-default answer: do not parse swissprot.
+send -- "n\r"
+expect -exact "n\r
+Parse trembl (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz)? \[Y/n\] "
+# Non-default answer: do not parse trembl.
+send -- "n\r"
+expect -exact "n\r
+Add another source by entering the name. An empty name cancels: "
+# An empty name ends the list of sources.
+send -- "\r"
+expect eof
+
+# Propagate configure's exit status so that a failed configuration fails the
+# caller instead of being reported as success. `wait` returns
+# {pid spawn_id os_error value}: os_error is 0 when value is the child's exit
+# status and -1 when value is an errno.
+lassign [wait] _pid _spawn_id os_error exit_status
+if {$os_error != 0} {
+	exit 1
+}
+exit $exit_status
diff --git a/workflows/static_database/structure.sql b/workflows/static_database/structure.sql
@@ -0,0 +1,37 @@
+-- GO terms: one row per term, keyed by `id`; `code` is unique.
+-- IF NOT EXISTS (as already used for `taxons`) lets the schema be re-applied
+-- to an existing database without errors.
+CREATE TABLE IF NOT EXISTS `go_terms` (
+  `id` INT NOT NULL,
+  `code` TEXT NOT NULL,
+  `namespace` TEXT NOT NULL,
+  `name` TEXT NOT NULL,
+  PRIMARY KEY (`id`)
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_go_code ON go_terms(code);
+
+-- EC numbers: one row per number, keyed by `id`; `code` is unique.
+-- IF NOT EXISTS (as already used for `taxons`) lets the schema be re-applied
+-- to an existing database without errors.
+CREATE TABLE IF NOT EXISTS `ec_numbers` (
+  `id` INT NOT NULL,
+  `code` TEXT NOT NULL,
+  `name` TEXT NOT NULL,
+  PRIMARY KEY (`id`)
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_ec_code ON ec_numbers(code);
+
+-- InterPro entries: one row per entry, keyed by `id`; `code` is unique.
+-- IF NOT EXISTS (as already used for `taxons`) lets the schema be re-applied
+-- to an existing database without errors.
+CREATE TABLE IF NOT EXISTS `interpro_entries` (
+  `id` INT NOT NULL,
+  `code` TEXT NOT NULL,
+  `category` TEXT NOT NULL,
+  `name` TEXT NOT NULL,
+  PRIMARY KEY (`id`)
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_ipr_code ON interpro_entries(code);
+
+-- Taxons: one row per taxon, keyed by `id`. `rank` and `parent_id` may be
+-- NULL; `valid_taxon` defaults to 1. No foreign key is declared on
+-- `parent_id` (presumably it refers to another taxon's `id` - confirm).
+CREATE TABLE IF NOT EXISTS `taxons` (
+  `id`          INT UNSIGNED NOT NULL,
+  `name`        TEXT NOT NULL,
+  `rank`        TEXT NULL DEFAULT NULL,
+  `parent_id`   INT NULL DEFAULT NULL,
+  `valid_taxon` INT NOT NULL DEFAULT 1,
+  PRIMARY KEY (`id`)
+);