Merge pull request #10 from unipept/feature/static-information-database
Implement Github action for building static information database
pverscha authored May 12, 2020
2 parents d6a013d + 7872f67 commit 4394a1f
Showing 5 changed files with 259 additions and 1 deletion.
96 changes: 96 additions & 0 deletions .github/workflows/static_database.yml
@@ -0,0 +1,96 @@
name: Database

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    - cron: '0 0 1 * *'

jobs:
  generate_static_database:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-java@v1
        with:
          java-version: '11'
          java-package: jdk
          architecture: x64
      - name: Get current date
        id: date
        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
      - name: Install required utilities
        run: |
          sudo apt-get update
          sudo apt-get -y install git make maven wget unzip expect gawk sqlite3 libsqlite3-dev
      - name: Where is expect?
        run: |
          which expect
      - name: Configure makefile
        run: |
          # The generation of umgap-data is not required for the static database
          sed -i '/checkdep umgap/d' configure
          sed -i '/all: makefile database index/d' makefile.in
          chmod +x workflows/static_database/script.exp
          chmod +x configure
          ./workflows/static_database/script.exp
      - name: Generate tsv.gz files
        shell: bash
        run: |
          make taxons
          make functional_annotations
      - name: Build SQLite database from generated files
        shell: bash
        run: |
          chmod +x workflows/static_database/build_database.sh
          ./workflows/static_database/build_database.sh
      - name: ls
        run: ls -hl
      - name: Update database versioning
        shell: bash
        run: |
          rm workflows/static_database/version.txt
          echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt
      - name: Update resources
        uses: alexesprit/action-update-file@master
        with:
          file-path: workflows/static_database/version.txt
          commit-msg: Bump db version to ${{ steps.date.outputs.date }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Get newly made commit sha
        id: commit_sha
        shell: bash
        run: |
          echo "::set-output name=sha::$(git rev-parse HEAD)"
      - name: Create new tag
        uses: octokit/request-action@v2.x
        id: create_new_tag
        with:
          route: POST /repos/:owner/:repo/git/tags
          owner: unipept
          repo: make-database
          tag: database-${{ steps.date.outputs.date }}
          message: "Static information database built on ${{ steps.date.outputs.date }}"
          object: ${{ steps.commit_sha.outputs.sha }}
          type: commit
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: database-${{ steps.date.outputs.date }}
          release_name: Static database ${{ steps.date.outputs.date }}
          draft: false
          prerelease: false
      - name: Upload Release Asset
        id: upload-release-asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./output.zip
          asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip
          asset_content_type: application/zip
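
For reference, the artifact this workflow publishes can be fetched and inspected with a short shell sketch like the one below. The sketch is not part of the committed workflow; it assumes curl, jq, unzip and sqlite3 are available and that the latest release of unipept/make-database carries a single zip asset, exactly as the "Upload Release Asset" step above uploads.

# Download and open the most recent static database release
url=$(curl -s https://api.github.com/repos/unipept/make-database/releases/latest \
  | jq -r '.assets[0].browser_download_url')
curl -sL "$url" -o unipept-static-db.zip
unzip unipept-static-db.zip        # contains output.db, as produced by build_database.sh
sqlite3 output.db '.tables'        # should list ec_numbers, go_terms, interpro_entries, taxons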
8 changes: 7 additions & 1 deletion makefile.in
@@ -32,6 +32,12 @@ taxons: \
<<<TABDIR>>>/taxons.tsv.gz \
<<<TABDIR>>>/lineages.tsv.gz

.PHONY: functional_annotations
functional_annotations: \
<<<TABDIR>>>/interpro_entries.tsv.gz \
<<<TABDIR>>>/go_terms.tsv.gz \
<<<TABDIR>>>/ec_numbers.tsv.gz

.PHONY: download
download: <<<SOURCE_FILES>>>
# }}}
@@ -70,7 +76,7 @@ $(JAR): $(SRC)
<<<LOGADD>>> "Finished unzipping names or nodes from the taxon dump."

<<<INTDIR>>>/clean-nodes.dmp: <<<INTDIR>>>/nodes.dmp
<<<LOGADD>>> "Starting cleaning unknown ranks form nodes."
<<<LOGADD>>> "Starting cleaning unknown ranks from nodes."
@mkdir -p $(dir $@)
<<<CMD_SED>>> < "$<" \
-e 's/subcohort/no rank/' \
11 changes: 11 additions & 0 deletions workflows/static_database/build_database.sh
@@ -0,0 +1,11 @@
# Initialize the database
cat workflows/static_database/structure.sql | sqlite3 output.db

# Read all generated data into this database
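# Note: the sed calls below rewrite the tab separators to '$', and '$' is then
# handed to sqlite3 as the CSV separator -- presumably because the generated
# values never contain '$', whereas term and taxon names may contain commas.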
zcat data/tables/ec_numbers.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin ec_numbers'
zcat data/tables/go_terms.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin go_terms'
zcat data/tables/interpro_entries.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin interpro_entries'
zcat data/tables/taxons.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin taxons'

# Compress the database before uploading it to a Github release
zip output.zip output.db
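
Once the make targets from the workflow above have populated data/tables, the script can also be exercised locally; a minimal sketch, run from the repository root:

chmod +x workflows/static_database/build_database.sh
./workflows/static_database/build_database.sh
ls -lh output.db output.zip        # the zip is what the workflow attaches to the release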
108 changes: 108 additions & 0 deletions workflows/static_database/script.exp
@@ -0,0 +1,108 @@
#!/usr/bin/expect -f
#
# This Expect script was generated by autoexpect on Tue May 12 17:00:43 2020
# Expect and autoexpect were both written by Don Libes, NIST.
#
# Note that autoexpect does not guarantee a working script. It
# necessarily has to guess about certain things. Two reasons a script
# might fail are:
#
# 1) timing - A surprising number of programs (rn, ksh, zsh, telnet,
# etc.) and devices discard or ignore keystrokes that arrive "too
# quickly" after prompts. If you find your new script hanging up at
# one spot, try adding a short sleep just before the previous send.
# Setting "force_conservative" to 1 (see below) makes Expect do this
# automatically - pausing briefly before sending each character. This
# pacifies every program I know of. The -c flag makes the script do
# this in the first place. The -C flag allows you to define a
# character to toggle this mode off and on.

set force_conservative 0  ;# set to 1 to force conservative mode even if
                          ;# script wasn't run conservatively originally
if {$force_conservative} {
    set send_slow {1 .1}
    proc send {ignore arg} {
        sleep .1
        exp_send -s -- $arg
    }
}

#
# 2) differing output - Some programs produce different output each time
# they run. The "date" command is an obvious example. Another is
# ftp, if it produces throughput statistics at the end of a file
# transfer. If this causes a problem, delete these patterns or replace
# them with wildcards. An alternative is to use the -p flag (for
# "prompt") which makes Expect only look for the last line of output
# (i.e., the prompt). The -P flag allows you to define a character to
# toggle this mode off and on.
#
# Read the man page for more info.
#
# -Don


set timeout -1
spawn ./configure
match_max 100000
expect -exact "Configuring the Unipept backend program.\r
What is the minimum length (inclusive) for tryptic peptides? \[5\] "
send -- "\r"
expect -exact "\r
What is the maximum length (inclusive) for tryptic peptides? \[50\] "
send -- "\r"
expect -exact "\r
What is the length (k) of the K-mer peptides? \[9\] "
send -- "\r"
expect -exact "\r
Where should I store the final TSV files (large, single-write)? \[./data/tables\] "
send -- "\r"
expect -exact "\r
Where should I store intermediate TSV files (large, single-write, multiple-read? \[./data/intermediate\] "
send -- "\r"
expect -exact "\r
Where should I store and extract the downloaded taxon zip (small, single-write, single-read)? \[./data/taxon\] "
send -- "\r"
expect -exact "\r
Where should I store the downloaded source xml files (large, single-write, single-read)? \[./data/sources\] "
send -- "\r"
expect -exact "\r
How much memory should Java use? \[6g\] "
send -- "1g\r"
expect -exact "1g\r
Which batch size should I use for communication with Entrez? \[1000\] "
send -- "\r"
expect -exact "\r
Which sort command should I use? \[sort --buffer-size=80% --parallel=4\] "
send -- "\r"
expect -exact "\r
Which pipe compression command should I use? \[gzip -\] "
send -- "\r"
expect -exact "\r
Which pipe decompression command (e.g. zcat, gzcat) should I use? \[zcat\] "
send -- "\r"
expect -exact "\r
How do I unzip while discarding dates? \[unzip -DD\] "
send -- "\r"
expect -exact "\r
What's my sed executable (e.g. sed, gsed)? \[sed\] "
send -- "\r"
expect -exact "\r
What's my gnu awk executable (e.g. awk, gawk)? \[awk\] "
send -- "gawk\r"
expect -exact "gawk\r
What's my gnu mktemp executable (e.g. mktemp, gmktemp)? \[mktemp\] "
send -- "\r"
expect -exact "\r
What's my gnu join executable (e.g. join, gjoin)? \[join\] "
send -- "\r"
expect -exact "\r
Parse swissprot (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz)? \[Y/n\] "
send -- "n\r"
expect -exact "n\r
Parse trembl (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz)? \[Y/n\] "
send -- "n\r"
expect -exact "n\r
Add another source by entering the name. An empty name cancels: "
send -- "\r"
expect eof
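
The transcript above was recorded with autoexpect (see the header comment), so if the prompts in ./configure ever change it is usually easier to re-record the session than to edit the expect/send pairs by hand. A sketch, assuming autoexpect is available (it ships alongside Expect on most distributions):

autoexpect -f workflows/static_database/script.exp ./configure
# answer the prompts once interactively; autoexpect writes the new transcript
chmod +x workflows/static_database/script.exp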
37 changes: 37 additions & 0 deletions workflows/static_database/structure.sql
@@ -0,0 +1,37 @@
CREATE TABLE `go_terms` (
`id` INT NOT NULL,
`code` TEXT NOT NULL,
`namespace` TEXT NOT NULL,
`name` TEXT NOT NULL,
PRIMARY KEY (`id`)
);

CREATE UNIQUE INDEX idx_go_code ON go_terms(code);

CREATE TABLE `ec_numbers` (
`id` INT NOT NULL,
`code` TEXT NOT NULL,
`name` TEXT NOT NULL,
PRIMARY KEY (`id`)
);

CREATE UNIQUE INDEX idx_ec_code ON ec_numbers(code);

CREATE TABLE `interpro_entries` (
`id` INT NOT NULL ,
`code` TEXT NOT NULL,
`category` TEXT NOT NULL,
`name` TEXT NOT NULL,
PRIMARY KEY (`id`)
);

CREATE UNIQUE INDEX idx_ipr_code ON interpro_entries(code);

CREATE TABLE IF NOT EXISTS `taxons` (
`id` INT UNSIGNED NOT NULL ,
`name` TEXT NOT NULL ,
`rank` TEXT NULL DEFAULT NULL ,
`parent_id` INT NULL DEFAULT NULL ,
`valid_taxon` INT NOT NULL DEFAULT 1 ,
PRIMARY KEY (`id`)
);
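
The unique indexes on the code columns suggest that lookups by identifier are the intended access path. A quick sanity check against the built database might look like this (a sketch with illustrative identifiers; the exact code format follows whatever the make targets emit):

sqlite3 output.db "SELECT name, namespace FROM go_terms WHERE code = 'GO:0008150';"
sqlite3 output.db "SELECT name FROM ec_numbers WHERE code = '1.1.1.1';"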
