-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from unipept/feature/static-information-database
Implement Github action for building static information database
- Loading branch information
Showing
5 changed files
with
259 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Scheduled workflow: builds the static information SQLite database and publishes it as a GitHub release. | ||
name: Database | ||
|
||
on: | ||
schedule: | ||
# * is a special character in YAML so you have to quote this string | ||
# Cron fields are minute, hour, day-of-month, month, day-of-week: this fires at 00:00 on the 1st of every month. | ||
- cron: '0 0 1 * *' | ||
|
||
jobs: | ||
generate_static_database: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v2 | ||
# Provide a Java 11 JDK (x64) for the build. | ||
- uses: actions/setup-java@v1 | ||
with: | ||
java-version: '11' | ||
java-package: jdk | ||
architecture: x64 | ||
# Exposes today's date (YYYY-MM-DD) as the `date` output of this step. | ||
- name: Get current date | ||
id: date | ||
# Step outputs are appended to the $GITHUB_OUTPUT file; the `::set-output` workflow command is deprecated. | ||
run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT" | ||
# Install the command-line tools used by the following steps (expect drives ./configure, sqlite3 builds the database). | ||
- name: Install required utilities | ||
run: | | ||
sudo apt-get update | ||
sudo apt-get -y install git make maven wget unzip expect gawk sqlite3 libsqlite3-dev | ||
# Debug aid: print the path of the installed expect binary. | ||
- name: Where is expect? | ||
run: | | ||
which expect | ||
# Delete the umgap dependency check and the default "all" target, then answer the interactive ./configure prompts through the expect script. | ||
- name: Configure makefile | ||
run: | | ||
# The generation of umgap-data is not required for the static database | ||
sed -i '/checkdep umgap/d' configure | ||
sed -i '/all: makefile database index/d' makefile.in | ||
chmod +x workflows/static_database/script.exp | ||
chmod +x configure | ||
./workflows/static_database/script.exp | ||
# Only the taxons and functional_annotations make targets are built. | ||
- name: Generate tsv.gz files | ||
shell: bash | ||
run: | | ||
make taxons | ||
make functional_annotations | ||
- name: Build SQLite database from generated files | ||
shell: bash | ||
run: | | ||
chmod +x workflows/static_database/build_database.sh | ||
./workflows/static_database/build_database.sh | ||
# Debug aid: list the workspace with human-readable sizes (presumably to confirm output.db / output.zip exist). | ||
- name: ls | ||
run: ls -hl | ||
# Writes the build date into version.txt so it can be committed by a later step. | ||
- name: Update database versioning | ||
shell: bash | ||
run: | | ||
# A plain redirect creates or truncates the file, so no prior `rm` is needed; `rm` would abort the step if version.txt were missing. | ||
echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt | ||
# Commits the refreshed version.txt back to the repository. | ||
- name: Update resources | ||
# The id makes this step's `commit-sha` output available to the next step. | ||
id: update_version | ||
uses: alexesprit/action-update-file@master | ||
with: | ||
file-path: workflows/static_database/version.txt | ||
commit-msg: Bump db version to ${{ steps.date.outputs.date }} | ||
github-token: ${{ secrets.GITHUB_TOKEN }} | ||
# Exposes the sha of the version-bump commit as the `sha` output of this step. | ||
- name: Get newly made commit sha | ||
id: commit_sha | ||
shell: bash | ||
run: | | ||
# The update action commits through the GitHub API, so the local checkout's HEAD does not move. | ||
# Prefer the sha reported by the action and fall back to the local HEAD when it reported none. | ||
sha="${{ steps.update_version.outputs.commit-sha }}" | ||
echo "sha=${sha:-$(git rev-parse HEAD)}" >> "$GITHUB_OUTPUT" | ||
# Calls the Git database API to create a tag named database-<date> for the commit sha from the previous step. | ||
# NOTE(review): this route is documented as creating only the tag object, not the refs/tags/... reference — confirm the tag really exists before the release step relies on it. | ||
- name: Create new tag | ||
# NOTE(review): "[email protected]" looks like an e-mail-obfuscation artifact of the page scrape; the real reference is presumably octokit/request-action@<version> — TODO restore from the repository. | ||
uses: octokit/[email protected] | ||
id: create_new_tag | ||
with: | ||
route: POST /repos/:owner/:repo/git/tags | ||
owner: unipept | ||
repo: make-database | ||
tag: database-${{ steps.date.outputs.date }} | ||
message: "Static information database built on ${{ steps.date.outputs.date }}" | ||
object: ${{ steps.commit_sha.outputs.sha }} | ||
type: commit | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
# Publish a regular (non-draft, non-prerelease) release for the database-<date> tag. | ||
- name: Create Release | ||
id: create_release | ||
uses: actions/create-release@v1 | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
with: | ||
tag_name: database-${{ steps.date.outputs.date }} | ||
release_name: Static database ${{ steps.date.outputs.date }} | ||
draft: false | ||
prerelease: false | ||
# Attach ./output.zip (written by build_database.sh) to the release created above, under a dated file name. | ||
- name: Upload Release Asset | ||
id: upload-release-asset | ||
uses: actions/upload-release-asset@v1 | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
with: | ||
upload_url: ${{ steps.create_release.outputs.upload_url }} | ||
asset_path: ./output.zip | ||
asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip | ||
asset_content_type: application/zip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Initialize the database | ||
cat workflows/static_database/structure.sql | sqlite3 output.db | ||
|
||
# Read all generated data into this database | ||
zcat data/tables/ec_numbers.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin ec_numbers' | ||
zcat data/tables/go_terms.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin go_terms' | ||
zcat data/tables/interpro_entries.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin interpro_entries' | ||
zcat data/tables/taxons.tsv.gz | sed "s/\t/$/g" | sqlite3 -csv -separator "$" output.db '.import /dev/stdin taxons' | ||
|
||
# Compress the database before uploading it to a Github release | ||
zip output.zip output.db |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
#!/usr/bin/expect -f | ||
# | ||
# This Expect script was generated by autoexpect on Tue May 12 17:00:43 2020 | ||
# Expect and autoexpect were both written by Don Libes, NIST. | ||
# | ||
# Note that autoexpect does not guarantee a working script. It | ||
# necessarily has to guess about certain things. Two reasons a script | ||
# might fail are: | ||
# | ||
# 1) timing - A surprising number of programs (rn, ksh, zsh, telnet, | ||
# etc.) and devices discard or ignore keystrokes that arrive "too | ||
# quickly" after prompts. If you find your new script hanging up at | ||
# one spot, try adding a short sleep just before the previous send. | ||
# Setting "force_conservative" to 1 (see below) makes Expect do this | ||
# automatically - pausing briefly before sending each character. This | ||
# pacifies every program I know of. The -c flag makes the script do | ||
# this in the first place. The -C flag allows you to define a | ||
# character to toggle this mode off and on. | ||
|
||
# Optional "conservative" mode: disabled here (0), so the block below is skipped. | ||
set force_conservative 0 ;# set to 1 to force conservative mode even if | ||
;# script wasn't run conservatively originally | ||
# When enabled, `send` is replaced by a wrapper that sleeps 0.1 s and then | ||
# sends its argument through `exp_send -s` (slow mode, paced by send_slow). | ||
if {$force_conservative} { | ||
set send_slow {1 .1} | ||
proc send {ignore arg} { | ||
sleep .1 | ||
exp_send -s -- $arg | ||
} | ||
} | ||
|
||
# | ||
# 2) differing output - Some programs produce different output each time | ||
# they run. The "date" command is an obvious example. Another is | ||
# ftp, if it produces throughput statistics at the end of a file | ||
# transfer. If this causes a problem, delete these patterns or replace | ||
# them with wildcards. An alternative is to use the -p flag (for | ||
# "prompt") which makes Expect only look for the last line of output | ||
# (i.e., the prompt). The -P flag allows you to define a character to | ||
# toggle this mode off and on. | ||
# | ||
# Read the man page for more info. | ||
# | ||
# -Don | ||
|
||
|
||
# Drive the interactive ./configure script: wait for each prompt verbatim and | ||
# answer it. A bare "\r" accepts the default shown in brackets. | ||
# timeout -1 disables the expect timeout, so a prompt that never matches | ||
# exactly blocks the script indefinitely. | ||
set timeout -1 | ||
spawn ./configure | ||
match_max 100000 | ||
expect -exact "Configuring the Unipept backend program.\r | ||
What is the minimum length (inclusive) for tryptic peptides? \[5\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
What is the maximum length (inclusive) for tryptic peptides? \[50\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
What is the length (k) of the K-mer peptides? \[9\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Where should I store the final TSV files (large, single-write)? \[./data/tables\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Where should I store intermediate TSV files (large, single-write, multiple-read? \[./data/intermediate\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Where should I store and extract the downloaded taxon zip (small, single-write, single-read)? \[./data/taxon\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Where should I store the downloaded source xml files (large, single-write, single-read)? \[./data/sources\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
How much memory should Java use? \[6g\] " | ||
# Non-default answer: 1g instead of the suggested 6g (presumably to fit the CI runner's memory — confirm). | ||
send -- "1g\r" | ||
expect -exact "1g\r | ||
Which batch size should I use for communication with Entrez? \[1000\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Which sort command should I use? \[sort --buffer-size=80% --parallel=4\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Which pipe compression command should I use? \[gzip -\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Which pipe decompression command (e.g. zcat, gzcat) should I use? \[zcat\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
How do I unzip while discarding dates? \[unzip -DD\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
What's my sed executable (e.g. sed, gsed)? \[sed\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
What's my gnu awk executable (e.g. awk, gawk)? \[awk\] " | ||
# Non-default answer: use gawk rather than the suggested awk. | ||
send -- "gawk\r" | ||
expect -exact "gawk\r | ||
What's my gnu mktemp executable (e.g. mktemp, gmktemp)? \[mktemp\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
What's my gnu join executable (e.g. join, gjoin)? \[join\] " | ||
send -- "\r" | ||
expect -exact "\r | ||
Parse swissprot (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz)? \[Y/n\] " | ||
# Non-default answer: do not parse swissprot. | ||
send -- "n\r" | ||
expect -exact "n\r | ||
Parse trembl (http://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz)? \[Y/n\] " | ||
# Non-default answer: do not parse trembl. | ||
send -- "n\r" | ||
expect -exact "n\r | ||
Add another source by entering the name. An empty name cancels: " | ||
# Empty name: add no further sources. | ||
send -- "\r" | ||
# Wait for ./configure to finish and close its output. | ||
expect eof |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
-- GO term lookup table. | ||
-- IF NOT EXISTS keeps the script re-runnable against an existing database, | ||
-- in line with the `taxons` definition in this file. | ||
CREATE TABLE IF NOT EXISTS `go_terms` ( | ||
    `id` INT NOT NULL, | ||
    `code` TEXT NOT NULL, | ||
    `namespace` TEXT NOT NULL, | ||
    `name` TEXT NOT NULL, | ||
    PRIMARY KEY (`id`) | ||
); | ||
|
||
-- Each code may appear only once. | ||
CREATE UNIQUE INDEX IF NOT EXISTS idx_go_code ON go_terms(code); | ||
|
||
-- EC number lookup table. | ||
-- IF NOT EXISTS keeps the script re-runnable against an existing database, | ||
-- in line with the `taxons` definition in this file. | ||
CREATE TABLE IF NOT EXISTS `ec_numbers` ( | ||
    `id` INT NOT NULL, | ||
    `code` TEXT NOT NULL, | ||
    `name` TEXT NOT NULL, | ||
    PRIMARY KEY (`id`) | ||
); | ||
|
||
-- Each code may appear only once. | ||
CREATE UNIQUE INDEX IF NOT EXISTS idx_ec_code ON ec_numbers(code); | ||
|
||
-- InterPro entry lookup table. | ||
-- IF NOT EXISTS keeps the script re-runnable against an existing database, | ||
-- in line with the `taxons` definition in this file. | ||
CREATE TABLE IF NOT EXISTS `interpro_entries` ( | ||
    `id` INT NOT NULL, | ||
    `code` TEXT NOT NULL, | ||
    `category` TEXT NOT NULL, | ||
    `name` TEXT NOT NULL, | ||
    PRIMARY KEY (`id`) | ||
); | ||
|
||
-- Each code may appear only once. | ||
CREATE UNIQUE INDEX IF NOT EXISTS idx_ipr_code ON interpro_entries(code); | ||
|
||
-- Taxon table, keyed by id. | ||
-- rank and parent_id are optional; valid_taxon defaults to 1. | ||
-- NOTE(review): parent_id presumably refers to another row's id, but no | ||
-- FOREIGN KEY is declared — confirm against the data. | ||
CREATE TABLE IF NOT EXISTS "taxons" ( | ||
    "id"          INT UNSIGNED NOT NULL PRIMARY KEY, | ||
    "name"        TEXT NOT NULL, | ||
    "rank"        TEXT NULL DEFAULT NULL, | ||
    "parent_id"   INT NULL DEFAULT NULL, | ||
    "valid_taxon" INT NOT NULL DEFAULT 1 | ||
); |