From 22f62a7dc1b15d30020cbad550ab8ca95f497092 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Mon, 12 Aug 2024 16:33:47 +0200 Subject: [PATCH] A lot of tweaking and improving of the unipept-index feature --- unipept-index/install.sh | 97 ++++++++++++++++--- .../~/db_schemas/structure_index_only.sql | 13 +++ .../~/db_schemas/structure_no_index.sql | 30 ++++++ 3 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 unipept-index/~/db_schemas/structure_index_only.sql create mode 100644 unipept-index/~/db_schemas/structure_no_index.sql diff --git a/unipept-index/install.sh b/unipept-index/install.sh index 6dbe29d..64f0b19 100755 --- a/unipept-index/install.sh +++ b/unipept-index/install.sh @@ -16,28 +16,27 @@ trap 'error_exit "An unexpected error occurred."' ERR # Define variables FEATURE_DIR="/unipept-index-data" VERSION_OPTION="${VERSION:-latest}" -GITHUB_API="https://api.github.com/repos/unipept/unipept-index/releases" +GITHUB_API_INDEX="https://api.github.com/repos/unipept/unipept-index/releases" +GITHUB_API_DATABASE="https://api.github.com/repos/unipept/unipept-database/releases" # Create the feature directory if it doesn't exist mkdir -p "$FEATURE_DIR" # Function to get releases from GitHub get_releases() { - curl -s "$GITHUB_API" | jq -r '.[] | .tag_name' || error_exit "Failed to retrieve releases from GitHub." + curl -s "$GITHUB_API_INDEX" | jq -r '.[] | .tag_name' || error_exit "Failed to retrieve releases from GitHub." 
} # Function to download and extract the specified version download_and_extract() { - local version=$1 + local version="$1" + local zip_file_name="$2" + local github_url="$3" local release_url - local zip_file_name local zip_file - # Construct the expected ZIP file name based on the version date - zip_file_name="index_SP_${version}.zip" - # Get the release URL for the specific ZIP file - release_url=$(curl -s "$GITHUB_API" | jq -r --arg zip_name "$zip_file_name" '.[] | .assets[] | select(.name == $zip_name) | .browser_download_url') + release_url=$(curl -s "$github_url" | jq -r --arg zip_name "$zip_file_name" --arg date "$version" '.[] | .assets[] | select(.created_at | contains($date)) | select(.name == $zip_name) | .browser_download_url') # Check if release URL is found if [ -z "$release_url" ]; then @@ -89,17 +88,93 @@ download_version() { # Extract the date part from the latest version latest_version_date=${latest_version#*SP_}; latest_version_date=${latest_version_date%.zip} - download_and_extract "$latest_version_date" + # First, download the zip file containing the index files + download_and_extract "$latest_version_date" "index_SP_${latest_version_date}.zip" "$GITHUB_API_INDEX" + # Then, download the zip file containing the database files + download_and_extract "$latest_version_date" "suffix-array.zip" "$GITHUB_API_DATABASE" echo "Successfully downloaded and extracted the latest version: $latest_version_date" else # Attempt to download the specified version - download_and_extract "$VERSION_OPTION" || { + download_and_extract "$VERSION_OPTION" "index_SP_${VERSION_OPTION}.zip" "$GITHUB_API_INDEX" || { echo "No release available for the specified date: $VERSION_OPTION" list_last_10_releases exit 1 } + download_and_extract "$VERSION_OPTION" "suffix-array.zip" "$GITHUB_API_DATABASE" + echo "Successfully downloaded and extracted version: $VERSION_OPTION" fi } -# Call the function to download the specified version +DB_TMP_DIR="~/db_schemas/" +DB_USER="root" 
+DB_PASSWORD="root_pass"
+
+# Install MariaDB and build the small `unipept` database that has to be filled with the UniProt-entries beforehand.
+# It will be used by the Unipept API to retrieve functional annotations and other metadata. Has to run as root.
+setup_database() {
+    # DB_TMP_DIR may hold a leading "~" that was quoted when assigned; the shell never expands a quoted tilde. Expand
+    # it here, otherwise the schemas end up in a directory literally named "~" below the current working directory.
+    local schema_dir="${DB_TMP_DIR/#\~/$HOME}"
+    local schema_url="https://raw.githubusercontent.com/unipept/unipept-database/master/schemas_suffix_array"
+
+    echo "Started constructing database..."
+    mkdir -p "$schema_dir"
+
+    # Download the database schema and the SQL-file that builds the indices for the database
+    wget -q "$schema_url/structure_no_index.sql" -O "$schema_dir/structure_no_index.sql"
+    wget -q "$schema_url/structure_index_only.sql" -O "$schema_dir/structure_index_only.sql"
+
+    # Install mariadb-server from apt without user interaction (no sudo: apt below already requires root)
+    export DEBIAN_FRONTEND="noninteractive"
+    debconf-set-selections <<< "mariadb-server mysql-server/root_password password $DB_PASSWORD"
+    debconf-set-selections <<< "mariadb-server mysql-server/root_password_again password $DB_PASSWORD"
+    apt update && apt install -y lz4 mariadb-server
+
+    # Start MariaDB service
+    service mariadb start
+
+    # Create the schema and its tables (no indices yet)
+    mysql -uroot -p"$DB_PASSWORD" < "$schema_dir/structure_no_index.sql"
+
+    # Load the UniProt-entries into the database
+    lz4 -dcfm "$FEATURE_DIR/uniprot_entries.tsv.lz4" | mariadb --local-infile=1 -uroot -p"$DB_PASSWORD" unipept -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE uniprot_entries;SHOW WARNINGS" 2>&1
+
+    # Build the database indices
+    mysql -uroot -p"$DB_PASSWORD" unipept < "$schema_dir/structure_index_only.sql"
+    echo "Constructing database finished..."
+}
+
+# Correctly move and extract the files required for the datastore used by the Unipept API.
+initialize_datastore() {
+    local lz4_file filename   # keep the loop variables out of the global scope of the install script
+    mkdir -p "$FEATURE_DIR/datastore"
+
+    # Iterate over each .tsv.lz4 file in the feature directory
+    for lz4_file in "$FEATURE_DIR"/*.tsv.lz4; do
+        # Without a match the glob stays a literal pattern, which is not an existing file
+        if [[ ! -f "$lz4_file" ]]; then
+            echo "No .lz4 files found in $FEATURE_DIR"
+            continue
+        fi
+
+        # Name of the extracted file: the archive name without the .lz4 extension
+        filename=$(basename "$lz4_file" .lz4)
+
+        # Extract into the datastore, then delete the archive (we no longer need it)
+        lz4 -d "$lz4_file" "$FEATURE_DIR/datastore/$filename"
+        rm "$lz4_file"
+        echo "Extracted $lz4_file to $FEATURE_DIR/datastore/$filename"
+    done
+
+    # Download sample data JSON-file (required for the API)
+    wget -q "https://raw.githubusercontent.com/unipept/unipept-database/master/schemas_suffix_array/sampledata.json" -O "$FEATURE_DIR/datastore/sampledata.json"
+
+    # TODO: the current database version still has to be written to the `.version` file; for now this step only
+    # prints an empty line.
+    echo ""
+}
+
+# Start the setup process
 download_version
+setup_database
+initialize_datastore
diff --git a/unipept-index/~/db_schemas/structure_index_only.sql b/unipept-index/~/db_schemas/structure_index_only.sql
new file mode 100644
index 0000000..948ca77
--- /dev/null
+++ b/unipept-index/~/db_schemas/structure_index_only.sql
@@ -0,0 +1,13 @@
+SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
+SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
+SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';
+
+-- -----------------------------------------------------
+-- Table `unipept`.`uniprot_entries`
+-- -----------------------------------------------------
+ALTER TABLE uniprot_entries ADD INDEX fk_uniprot_entries_taxons (taxon_id ASC);
+ALTER TABLE uniprot_entries ADD UNIQUE INDEX idx_uniprot_entries_accession (uniprot_accession_number ASC);
+
+SET SQL_MODE=@OLD_SQL_MODE;
+SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
+SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
diff --git 
a/unipept-index/~/db_schemas/structure_no_index.sql b/unipept-index/~/db_schemas/structure_no_index.sql
new file mode 100644
index 0000000..84cd4dd
--- /dev/null
+++ b/unipept-index/~/db_schemas/structure_no_index.sql
@@ -0,0 +1,30 @@
+SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
+SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
+SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';
+
+-- Drop the old database. It is recreated by the CREATE SCHEMA statement right below.
+DROP DATABASE IF EXISTS `unipept`;
+
+CREATE SCHEMA IF NOT EXISTS `unipept` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;
+USE `unipept` ;
+
+-- -----------------------------------------------------
+-- Table `unipept`.`uniprot_entries`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `unipept`.`uniprot_entries` (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT ,
+  `uniprot_accession_number` CHAR(10) ASCII NOT NULL ,
+  `version` SMALLINT UNSIGNED NOT NULL ,
+  `taxon_id` MEDIUMINT UNSIGNED NOT NULL ,
+  `type` ENUM('swissprot', 'trembl') NOT NULL ,
+  `name`VARCHAR(150) NOT NULL ,
+  `protein` TEXT NOT NULL ,
+  `fa` TEXT NOT NULL ,
+  PRIMARY KEY (`id`))
+ENGINE = InnoDB
+DEFAULT CHARACTER SET = ascii
+COLLATE = ascii_general_ci;
+
+SET SQL_MODE=@OLD_SQL_MODE;
+SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
+SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;