Skip to content

Commit

Permalink
Merge pull request #41 from stijndcl/feature/download-taxdmp-ci
Browse files Browse the repository at this point in the history
Download NCBI Taxdmp file in CI
  • Loading branch information
pverscha authored Mar 14, 2024
2 parents 90eb5f4 + 898182f commit 7aa805e
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 5 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/static_database.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ jobs:
- name: Install required utilities
run: |
sudo apt-get update
sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs
sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget
# Fetch the NCBI taxonomy dump into the workspace as ./taxdmp.zip; the
# "Upload NCBI Taxdmp Release Asset" step later attaches this file to the release.
- name: Download Taxdmp file
shell: bash
run: wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip
- name: Generate tsv.gz files
shell: bash
run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output"
Expand Down Expand Up @@ -60,8 +63,8 @@ jobs:
release_name: Static database ${{ steps.date.outputs.date }}
draft: false
prerelease: false
- name: Upload Release Asset
id: upload-release-asset
- name: Upload Static Database Release Asset
id: upload-database-release-asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand All @@ -70,3 +73,13 @@ jobs:
asset_path: ./output.zip
asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip
asset_content_type: application/zip
# Attach the downloaded taxonomy dump to the release created by the
# `create_release` step. The asset name must stay "ncbi-taxdmp.zip":
# download_taxdmp in scripts/build_database.sh searches the latest release
# for a download URL ending in exactly that file name.
- name: Upload NCBI Taxdmp Release Asset
id: upload-taxdmp-release-asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./taxdmp.zip
asset_name: ncbi-taxdmp.zip
asset_content_type: application/zip
20 changes: 18 additions & 2 deletions scripts/build_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command sho
CMD_GZIP="gzip -" # Which pipe compression command should I use?
ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?

TAXON_URL="https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt"
EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat"
GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo"
Expand Down Expand Up @@ -315,11 +315,27 @@ have() {

### All the different database construction steps.

#######################################
# Download the NCBI taxonomy dump (taxdmp.zip) into the temporary directory.
#
# The copy attached to the latest unipept-database GitHub release (asset
# "ncbi-taxdmp.zip") is preferred. When the release lookup yields no such
# asset, or downloading it fails, TAXON_FALLBACK_URL is used instead.
#
# Globals:
#   TEMP_DIR, UNIPEPT_TEMP_CONSTANT (read)  - directory receiving taxdmp.zip
#   TAXON_FALLBACK_URL (read)               - mirror used as a fallback
#   TAXON_URL (written)                     - URL of the last download attempt
# Returns:
#   0 when taxdmp.zip was downloaded, curl's non-zero status otherwise.
#######################################
download_taxdmp() {
    local latest_release_url="https://api.github.com/repos/unipept/unipept-database/releases/latest"
    # Matches the path part of the asset's download URL; the dot is escaped so
    # that only a literal ".zip" suffix is accepted.
    local asset_re='unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp\.zip'
    local target="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"
    local self_hosted_path

    # Check if our self-hosted version is available or not using the GitHub API.
    # "No match" and "API unreachable" are expected outcomes that must lead to
    # the fallback below, so the pipeline status is deliberately ignored: the
    # script must not abort here when it runs with `set -e` / `pipefail`.
    # `head -n 1` keeps a single URL even if the pattern occurs more than once.
    self_hosted_path=$(curl -s "$latest_release_url" | grep -E -o "$asset_re" | head -n 1) || true

    if [ -n "$self_hosted_path" ]; then
        TAXON_URL="https://github.com/$self_hosted_path"
    else
        TAXON_URL="$TAXON_FALLBACK_URL"
    fi

    # --fail turns HTTP errors into a non-zero exit status instead of saving
    # the server's error page under the name taxdmp.zip.
    if curl -L --fail --create-dirs --silent --output "$target" "$TAXON_URL"; then
        return 0
    fi

    # The self-hosted copy could not be fetched: retry once from the mirror.
    if [ "$TAXON_URL" != "$TAXON_FALLBACK_URL" ]; then
        TAXON_URL="$TAXON_FALLBACK_URL"
    else
        return 1
    fi

    curl -L --fail --create-dirs --silent --output "$target" "$TAXON_URL"
}

create_taxon_tables() {
log "Started creating the taxon tables."
reportProgress -1 "Creating taxon tables." 1

curl --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL"
download_taxdmp
unzip "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "names.dmp" "nodes.dmp" -d "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT"
rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"

Expand Down

0 comments on commit 7aa805e

Please sign in to comment.