Skip to content

Commit

Permalink
Update Taxdmp URL, use self-hosted if available
Browse files Browse the repository at this point in the history
  • Loading branch information
stijndcl committed Mar 8, 2024
1 parent 3c3a5a5 commit 898182f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/static_database.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget
- name: Download Taxdmp file
shell: bash
run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
run: wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip
- name: Generate tsv.gz files
shell: bash
run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output"
Expand Down
20 changes: 18 additions & 2 deletions scripts/build_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command sho
CMD_GZIP="gzip -" # Which pipe compression command should I use?
ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?

TAXON_URL="https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt"
EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat"
GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo"
Expand Down Expand Up @@ -315,11 +315,27 @@ have() {

### All the different database construction steps.

# Download the NCBI taxonomy dump to "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip".
#
# The latest release of unipept/unipept-database is queried through the GitHub
# API. If that release lists an "ncbi-taxdmp.zip" asset, the asset is used as
# the download source; otherwise the download falls back to TAXON_FALLBACK_URL.
#
# Globals:
#   TEMP_DIR, UNIPEPT_TEMP_CONSTANT (read) - together form the target directory
#   TAXON_FALLBACK_URL (read)              - URL used when no self-hosted asset is found
#   TAXON_URL (written)                    - set to the URL that is actually downloaded
# Arguments: none
# Returns: the exit status of the final curl download (non-zero on transfer or
#          HTTP errors, because of --fail).
download_taxdmp() {
    local latest_release_url="https://api.github.com/repos/unipept/unipept-database/releases/latest"
    # The literal dot is escaped and quotes are excluded from the tag segment so
    # that a match can never run past the end of a JSON string value.
    local taxdmp_release_asset_re='unipept/unipept-database/releases/download/[^/"]+/ncbi-taxdmp\.zip'
    local target="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"
    local self_hosted_path

    # Check if our self-hosted version is available or not using the GitHub API.
    # grep -o prints one line per match, so only the first match is kept. The
    # trailing "||" keeps a failed lookup from being treated as a fatal error:
    # an empty result simply selects the fallback below.
    self_hosted_path=$(curl -s "$latest_release_url" \
        | grep -E -o "$taxdmp_release_asset_re" \
        | head -n 1) || self_hosted_path=""

    if [ -n "$self_hosted_path" ]
    then
        TAXON_URL="https://github.com/$self_hosted_path"
    else
        TAXON_URL="$TAXON_FALLBACK_URL"
    fi

    # -L follows the redirect GitHub issues for release assets; --fail makes curl
    # return an error on HTTP failures instead of saving the error page as the zip.
    curl -L --fail --create-dirs --silent --output "$target" "$TAXON_URL"
}

create_taxon_tables() {
log "Started creating the taxon tables."
reportProgress -1 "Creating taxon tables." 1

curl --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL"
download_taxdmp
unzip "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "names.dmp" "nodes.dmp" -d "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT"
rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"

Expand Down

0 comments on commit 898182f

Please sign in to comment.