Skip to content

Commit

Permalink
A lot of tweaking and improving of the unipept-index feature
Browse files Browse the repository at this point in the history
  • Loading branch information
pverscha committed Aug 12, 2024
1 parent cc0019e commit 22f62a7
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 11 deletions.
97 changes: 86 additions & 11 deletions unipept-index/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,27 @@ trap 'error_exit "An unexpected error occurred."' ERR
# Define variables
# Directory that receives the downloaded index and database files.
FEATURE_DIR="/unipept-index-data"
# Requested release; falls back to "latest" when VERSION is unset or empty.
VERSION_OPTION="${VERSION:-latest}"
# NOTE(review): GITHUB_API holds the same URL as GITHUB_API_INDEX below — presumably a leftover; confirm before removing.
GITHUB_API="https://api.github.com/repos/unipept/unipept-index/releases"
# GitHub releases endpoints of the unipept-index and unipept-database repositories.
GITHUB_API_INDEX="https://api.github.com/repos/unipept/unipept-index/releases"
GITHUB_API_DATABASE="https://api.github.com/repos/unipept/unipept-database/releases"

# Create the feature directory if it doesn't exist
mkdir -p "$FEATURE_DIR"

# Print the tag name of every release of the unipept-index repository, one per line.
#
# Globals:  GITHUB_API_INDEX (read)
# Outputs:  release tag names on stdout
# Returns:  does not return on failure; error_exit is invoked instead
get_releases() {
  local response

  # Query the releases endpoint exactly once. -f turns HTTP errors (for example rate limiting) into a non-zero
  # exit status and -S keeps the error message visible even though -s silences the progress meter.
  response=$(curl -fsS "$GITHUB_API_INDEX") || error_exit "Failed to retrieve releases from GitHub."

  jq -r '.[] | .tag_name' <<< "$response" || error_exit "Failed to retrieve releases from GitHub."
}

# Function to download and extract the specified version
download_and_extract() {
local version=$1
local version="$1"
local zip_file_name="$2"
local github_url="$3"
local release_url
local zip_file_name
local zip_file

# Construct the expected ZIP file name based on the version date
zip_file_name="index_SP_${version}.zip"

# Get the release URL for the specific ZIP file
release_url=$(curl -s "$GITHUB_API" | jq -r --arg zip_name "$zip_file_name" '.[] | .assets[] | select(.name == $zip_name) | .browser_download_url')
release_url=$(curl -s "$github_url" | jq -r --arg zip_name "$zip_file_name" --arg date "$version" '.[] | .assets[] | select(.created_at | contains($date)) | select(.name == $zip_name) | .browser_download_url')

# Check if release URL is found
if [ -z "$release_url" ]; then
Expand Down Expand Up @@ -89,17 +88,93 @@ download_version() {
# Extract the date part from the latest version
latest_version_date=${latest_version#*SP_}; latest_version_date=${latest_version_date%.zip}

download_and_extract "$latest_version_date"
# First, download the zip file containing the index files
download_and_extract "$latest_version_date" "index_SP_${latest_version_date}.zip" "$GITHUB_API_INDEX"
# Then, download the zip file containing the database files
download_and_extract "$latest_version_date" "suffix-array.zip" "$GITHUB_API_DATABASE"
echo "Successfully downloaded and extracted the latest version: $latest_version_date"
else
# Attempt to download the specified version
download_and_extract "$VERSION_OPTION" || {
download_and_extract "$VERSION_OPTION" "index_SP_${VERSION_OPTION}.zip" "$GITHUB_API_INDEX" || {
echo "No release available for the specified date: $VERSION_OPTION"
list_last_10_releases
exit 1
}
download_and_extract "$VERSION_OPTION" "suffix-array.zip" "$GITHUB_API_DATABASE"
echo "Successfully downloaded and extracted version: $VERSION_OPTION"
fi
}

# Settings for the local MariaDB instance that is created by setup_database.
# The schema directory is built from $HOME instead of a quoted "~": tilde expansion does not happen inside quotes, so
# "~/db_schemas/" would create a directory literally named "~" in the current working directory.
DB_TMP_DIR="${HOME}/db_schemas"
DB_USER="root"
# NOTE(review): hard-coded credential; acceptable only for a throw-away local database.
DB_PASSWORD="root_pass"

# We also need to install and set up a small MariaDB (MySQL-compatible) database that requires the UniProt entries to
# be loaded beforehand. It will be used by the Unipept API to retrieve functional annotations and other metadata.
#
# Globals:  DB_TMP_DIR, DB_USER, DB_PASSWORD, FEATURE_DIR (all read)
# Requires: root privileges (packages are installed and a service is started) and wget
# Returns:  non-zero when decompressing or loading the UniProt entries fails
setup_database() {
  local -a load_status

  echo "Started constructing database..."

  # First, download the database schemas that are required for the suffix array
  mkdir -p "$DB_TMP_DIR"

  # Download the database schema
  wget -q "https://raw.githubusercontent.com/unipept/unipept-database/master/schemas_suffix_array/structure_no_index.sql" -O "$DB_TMP_DIR/structure_no_index.sql"
  # Download an SQL-file that starts building indices for the database
  wget -q "https://raw.githubusercontent.com/unipept/unipept-database/master/schemas_suffix_array/structure_index_only.sql" -O "$DB_TMP_DIR/structure_index_only.sql"

  # Install mariadb-server from apt without user interaction.
  # No sudo here: the package installation and service start below already require root, and sudo is often absent
  # from minimal container images.
  export DEBIAN_FRONTEND="noninteractive"
  debconf-set-selections <<< "mariadb-server mysql-server/root_password password $DB_PASSWORD"
  debconf-set-selections <<< "mariadb-server mysql-server/root_password_again password $DB_PASSWORD"

  # apt-get instead of apt: apt warns that its command line interface is not stable for use in scripts.
  apt-get update && apt-get install -y lz4 mariadb-server

  # Start MariaDB service
  service mariadb start

  # Import the SQL files into the database (this creates the `unipept` schema and the empty table)
  mariadb -u"$DB_USER" -p"$DB_PASSWORD" < "$DB_TMP_DIR/structure_no_index.sql"

  # Load the UniProt-entries into the database
  lz4 -dcfm "$FEATURE_DIR/uniprot_entries.tsv.lz4" | mariadb --local-infile=1 -u"$DB_USER" -p"$DB_PASSWORD" unipept -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE uniprot_entries;SHOW WARNINGS" 2>&1
  # Without pipefail only the status of the last pipeline stage is reported, so inspect every stage explicitly;
  # otherwise a failed decompression would silently leave the table empty or truncated.
  load_status=("${PIPESTATUS[@]}")
  if (( load_status[0] != 0 || load_status[1] != 0 )); then
    echo "Failed to load $FEATURE_DIR/uniprot_entries.tsv.lz4 into the database." >&2
    return 1
  fi

  # Build the database indices (done after the bulk load, on the already populated table)
  mariadb -u"$DB_USER" -p"$DB_PASSWORD" unipept < "$DB_TMP_DIR/structure_index_only.sql"

  echo "Constructing database finished..."
}

# Correctly move and extract the files required for the datastore used by the Unipept API.
#
# Every *.tsv.lz4 file directly inside FEATURE_DIR is decompressed into FEATURE_DIR/datastore (dropping the .lz4
# suffix) and the compressed original is removed afterwards. The sample data JSON file is downloaded next to them.
#
# Globals:  FEATURE_DIR (read)
# Outputs:  one progress line per processed file on stdout
initialize_datastore() {
  local datastore_dir="$FEATURE_DIR/datastore"
  local lz4_file
  local filename

  mkdir -p "$datastore_dir"

  # Iterate over each .lz4 file in the source directory
  for lz4_file in "$FEATURE_DIR"/*.tsv.lz4; do
    # When nothing matches, the glob is left unexpanded and does not name a real file
    if [[ ! -f "$lz4_file" ]]; then
      echo "No .lz4 files found in $FEATURE_DIR"
      continue
    fi

    # Extract the file name without the .lz4 extension
    filename=$(basename "$lz4_file" .lz4)

    # Extract the .lz4 file to the target subdirectory; -f overwrites the output of an earlier (partial) run
    # instead of prompting or failing
    lz4 -d -f "$lz4_file" "$datastore_dir/$filename"

    # Delete original file (we no longer need this)
    rm -- "$lz4_file"

    echo "Extracted $lz4_file to $FEATURE_DIR/datastore/$filename"
  done

  # Download sample data JSON-file (required for the API)
  wget -q "https://raw.githubusercontent.com/unipept/unipept-database/master/schemas_suffix_array/sampledata.json" -O "$datastore_dir/sampledata.json"

  # TODO: keep track of the current database version inside of the `.version` file; this is not implemented yet,
  # only an empty line is printed.
  echo ""
}

# Start the setup process
# The order matters: setup_database reads "$FEATURE_DIR/uniprot_entries.tsv.lz4", while initialize_datastore
# decompresses and then deletes every *.tsv.lz4 file in FEATURE_DIR, so it has to run last.
# NOTE(review): download_version presumably places those archives in FEATURE_DIR — confirm against its body.
download_version
setup_database
initialize_datastore
13 changes: 13 additions & 0 deletions unipept-index/~/db_schemas/structure_index_only.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Adds the indices to the `uniprot_entries` table as a separate step from table creation.
-- Save the current session settings in user variables and relax the unique/foreign-key checks while the
-- indices are being added; the saved values are restored at the end of this script.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';

-- -----------------------------------------------------
-- Table `unipept`.`uniprot_entries`
-- -----------------------------------------------------
-- Index on taxon_id.
ALTER TABLE uniprot_entries ADD INDEX fk_uniprot_entries_taxons (taxon_id ASC);
-- Unique index on the accession number; duplicate accession numbers make this statement fail.
ALTER TABLE uniprot_entries ADD UNIQUE INDEX idx_uniprot_entries_accession (uniprot_accession_number ASC);

-- Restore the session settings saved at the top of this script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
30 changes: 30 additions & 0 deletions unipept-index/~/db_schemas/structure_no_index.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- Creates the `unipept` schema and an empty `uniprot_entries` table with only its primary key defined.
-- Save the current session settings in user variables and relax the unique/foreign-key checks; the saved
-- values are restored at the end of this script.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';

-- Drop the old database. This database will be recreated further on during this script!
-- WARNING: this is destructive; any existing `unipept` data is lost when this script runs.
DROP DATABASE IF EXISTS `unipept`;

CREATE SCHEMA IF NOT EXISTS `unipept` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;
USE `unipept` ;

-- -----------------------------------------------------
-- Table `unipept`.`uniprot_entries`
-- -----------------------------------------------------
-- Note: the table overrides the schema's utf8 default with the ascii character set (see the table options below).
CREATE TABLE IF NOT EXISTS `unipept`.`uniprot_entries` (
`id` INT UNSIGNED NOT NULL AUTO_INCREMENT ,
`uniprot_accession_number` CHAR(10) ASCII NOT NULL ,
`version` SMALLINT UNSIGNED NOT NULL ,
`taxon_id` MEDIUMINT UNSIGNED NOT NULL ,
`type` ENUM('swissprot', 'trembl') NOT NULL ,
`name`VARCHAR(150) NOT NULL ,
`protein` TEXT NOT NULL ,
`fa` TEXT NOT NULL ,
PRIMARY KEY (`id`))
ENGINE = InnoDB
DEFAULT CHARACTER SET = ascii
COLLATE = ascii_general_ci;

-- Restore the session settings saved at the top of this script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

0 comments on commit 22f62a7

Please sign in to comment.