diff --git a/schemas/structure.sql b/schemas/structure.sql deleted file mode 100644 index a96d48c6..00000000 --- a/schemas/structure.sql +++ /dev/null @@ -1,303 +0,0 @@ -SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0; -SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0; -SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL'; - -CREATE SCHEMA IF NOT EXISTS `unipept` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ; -USE `unipept` ; - --- ----------------------------------------------------- --- Table `unipept`.`taxons` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`taxons` ( - `id` MEDIUMINT UNSIGNED NOT NULL , - `name` VARCHAR(120) NOT NULL , - `rank` ENUM('no rank', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'superorder', 'order', 'suborder', 'infraorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species group', 'species subgroup', 'species', 'subspecies', 'strain', 'varietas', 'forma' ) NULL DEFAULT NULL , - `parent_id` MEDIUMINT UNSIGNED NULL DEFAULT NULL , - `valid_taxon` BIT NOT NULL DEFAULT 1 , - PRIMARY KEY (`id`) , - INDEX `fk_taxon_taxon` (`parent_id` ASC) , - CONSTRAINT `fk_taxon_taxon` - FOREIGN KEY (`parent_id` ) - REFERENCES `unipept`.`taxons` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`uniprot_entries` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`uniprot_entries` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `uniprot_accession_number` CHAR(10) ASCII NOT NULL, - `version` SMALLINT UNSIGNED NOT NULL, - `taxon_id` MEDIUMINT UNSIGNED NOT NULL, - `type` ENUM('swissprot', 'trembl') NOT NULL, - `name` VARCHAR(150) NOT NULL, - `protein` TEXT NOT NULL, - 
PRIMARY KEY (`id`), - INDEX `fk_uniprot_entries_taxons_idx` (`taxon_id` ASC), - UNIQUE INDEX `idx_uniprot_entries_accession` (`uniprot_accession_number` ASC), - CONSTRAINT `fk_uniprot_entries_taxons` - FOREIGN KEY (`taxon_id`) - REFERENCES `unipept`.`taxons` (`id`) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`ec_numbers` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`ec_numbers` ( - `id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, - `code` VARCHAR(15) NOT NULL, - `name` VARCHAR(140) NOT NULL, - PRIMARY KEY (`id`), - UNIQUE INDEX `ec_number_UNIQUE` (`code` ASC)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`go_terms` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`go_terms` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `code` VARCHAR(15) NOT NULL, - `namespace` ENUM('biological process', 'molecular function', 'cellular component') NOT NULL, - `name` VARCHAR(200) NOT NULL, - PRIMARY KEY (`id`), - UNIQUE INDEX `uidx_code` (`code` ASC)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`interpro` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`interpro_entries` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `code` VARCHAR(9) NOT NULL, - `category` VARCHAR(32) NOT NULL, - `name` VARCHAR(160) NOT NULL, - PRIMARY KEY (`id`), - UNIQUE INDEX `idx_interpro_code` (`code` ASC)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`lineages` --- 
----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`lineages` ( - `taxon_id` MEDIUMINT UNSIGNED NOT NULL, - `superkingdom` MEDIUMINT NULL DEFAULT NULL, - `kingdom` MEDIUMINT NULL DEFAULT NULL, - `subkingdom` MEDIUMINT NULL DEFAULT NULL, - `superphylum` MEDIUMINT NULL DEFAULT NULL, - `phylum` MEDIUMINT NULL DEFAULT NULL, - `subphylum` MEDIUMINT NULL DEFAULT NULL, - `superclass` MEDIUMINT NULL DEFAULT NULL, - `class` MEDIUMINT NULL DEFAULT NULL, - `subclass` MEDIUMINT NULL DEFAULT NULL, - `superorder` MEDIUMINT NULL DEFAULT NULL, - `order` MEDIUMINT NULL DEFAULT NULL, - `suborder` MEDIUMINT NULL DEFAULT NULL, - `infraorder` MEDIUMINT NULL DEFAULT NULL, - `superfamily` MEDIUMINT NULL DEFAULT NULL, - `family` MEDIUMINT NULL DEFAULT NULL, - `subfamily` MEDIUMINT NULL DEFAULT NULL, - `tribe` MEDIUMINT NULL DEFAULT NULL, - `subtribe` MEDIUMINT NULL DEFAULT NULL, - `genus` MEDIUMINT NULL DEFAULT NULL, - `subgenus` MEDIUMINT NULL DEFAULT NULL, - `species_group` MEDIUMINT NULL DEFAULT NULL, - `species_subgroup` MEDIUMINT NULL DEFAULT NULL, - `species` MEDIUMINT NULL DEFAULT NULL, - `subspecies` MEDIUMINT NULL DEFAULT NULL, - `strain` MEDIUMINT NULL DEFAULT NULL, - `varietas` MEDIUMINT NULL DEFAULT NULL, - `forma` MEDIUMINT NULL DEFAULT NULL, - PRIMARY KEY (`taxon_id`), - INDEX `fk_lineages_taxons_idx` (`taxon_id` ASC), - CONSTRAINT `fk_lineages_taxons` - FOREIGN KEY (`taxon_id`) - REFERENCES `unipept`.`taxons` (`id`) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`sequences` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`sequences` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `sequence` VARCHAR(50) NOT NULL , - `lca` MEDIUMINT UNSIGNED NULL , - `lca_il` MEDIUMINT UNSIGNED NULL , - `fa` BLOB NULL , - `fa_il` BLOB NULL , - 
PRIMARY KEY (`id`) , - UNIQUE INDEX `uidx_sequence` (`sequence` ASC) , - INDEX `fk_sequences_taxons` (`lca` ASC) , - INDEX `fk_sequences_taxons_2` (`lca_il` ASC) , - CONSTRAINT `fk_sequences_taxons` - FOREIGN KEY (`lca` ) - REFERENCES `unipept`.`taxons` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_sequences_taxons_2` - FOREIGN KEY (`lca_il` ) - REFERENCES `unipept`.`taxons` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=16 -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`peptides` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`peptides` ( - `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT , - `sequence_id` INT UNSIGNED NOT NULL , - `original_sequence_id` INT UNSIGNED NOT NULL , - `uniprot_entry_id` INT UNSIGNED NOT NULL , - PRIMARY KEY (`id`) , - INDEX `fk_peptides_sequences` (`sequence_id` ASC) , - INDEX `fk_peptides_uniprot_entries` (`uniprot_entry_id` ASC) , - INDEX `fk_peptides_original_sequences` (`original_sequence_id` ASC) , - CONSTRAINT `fk_peptides_sequences` - FOREIGN KEY (`sequence_id` ) - REFERENCES `unipept`.`sequences` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_peptides_uniprot_entries` - FOREIGN KEY (`uniprot_entry_id` ) - REFERENCES `unipept`.`uniprot_entries` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_peptides_original_sequences` - FOREIGN KEY (`original_sequence_id` ) - REFERENCES `unipept`.`sequences` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`datasets` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`datasets` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , 
- `environment` VARCHAR(160) NULL , - `reference` VARCHAR(500) NULL , - `url` VARCHAR(200) NULL , - `project_website` VARCHAR(200) NULL , - PRIMARY KEY (`id`) ) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`dataset_items` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`dataset_items` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `dataset_id` INT UNSIGNED NULL , - `name` VARCHAR(160) NULL , - `data` MEDIUMTEXT CHARACTER SET 'ascii' COLLATE 'ascii_general_ci' NOT NULL , - `order` INT NULL , - PRIMARY KEY (`id`) , - INDEX `fk_dataset_items_datasets` (`dataset_id` ASC) , - CONSTRAINT `fk_dataset_items_datasets` - FOREIGN KEY (`dataset_id` ) - REFERENCES `unipept`.`datasets` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; - --- ----------------------------------------------------- --- Table `unipept`.`go_cross_references` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`go_cross_references` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `uniprot_entry_id` INT UNSIGNED NOT NULL, - `go_term_code` VARCHAR(15) NOT NULL, - PRIMARY KEY (`id`), - INDEX `fk_go_reference_uniprot_entries` (`uniprot_entry_id` ASC), - INDEX `fk_go_cross_reference_go_terms_idx` (`go_term_code` ASC), - CONSTRAINT `fk_go_cross_reference_uniprot_entries` - FOREIGN KEY (`uniprot_entry_id`) - REFERENCES `unipept`.`uniprot_entries` (`id`) - ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_go_cross_reference_go_terms` - FOREIGN KEY (`go_term_code`) - REFERENCES `unipept`.`go_terms` (`code`) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table 
`unipept`.`ec_cross_references` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`ec_cross_references` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `uniprot_entry_id` INT UNSIGNED NOT NULL, - `ec_number_code` VARCHAR(15) NOT NULL, - PRIMARY KEY (`id`), - INDEX `fk_ec_reference_uniprot_entries` (`uniprot_entry_id` ASC), - INDEX `fk_ec_cross_reference_ec_numbers_idx` (`ec_number_code` ASC), - CONSTRAINT `fk_ec_cross_reference_uniprot_entries` - FOREIGN KEY (`uniprot_entry_id`) - REFERENCES `unipept`.`uniprot_entries` (`id`) - ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_ec_cross_reference_ec_numbers` - FOREIGN KEY (`ec_number_code`) - REFERENCES `unipept`.`ec_numbers` (`code`) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`interpro_cross_references` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`interpro_cross_references` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `uniprot_entry_id` INT UNSIGNED NOT NULL , - `interpro_entry_code` VARCHAR(9) NOT NULL , - PRIMARY KEY (`id`), - INDEX `fk_interpro_reference_uniprot_entries` (`uniprot_entry_id` ASC)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - -SET SQL_MODE=@OLD_SQL_MODE; -SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; -SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; diff --git a/schemas/structure_constraints_only.sql b/schemas/structure_constraints_only.sql new file mode 100644 index 00000000..b17d4c0a --- /dev/null +++ b/schemas/structure_constraints_only.sql @@ -0,0 +1,33 @@ +ALTER TABLE unipept.taxons + ADD PRIMARY KEY (id); +ALTER TABLE unipept.uniprot_entries + ADD PRIMARY KEY (id); +ALTER TABLE unipept.ec_numbers + ADD PRIMARY KEY (id); +ALTER TABLE unipept.go_terms + ADD PRIMARY KEY (id); +ALTER TABLE 
unipept.interpro_entries + ADD PRIMARY KEY (id); +ALTER TABLE unipept.lineages + ADD PRIMARY KEY (taxon_id); +ALTER TABLE unipept.sequences + ADD PRIMARY KEY (id); +ALTER TABLE unipept.peptides + ADD PRIMARY KEY (id); +ALTER TABLE unipept.datasets + ADD PRIMARY KEY (id); +ALTER TABLE unipept.dataset_items + ADD PRIMARY KEY (id); +ALTER TABLE unipept.dataset_items + ADD CONSTRAINT fk_dataset_items_datasets + FOREIGN KEY (dataset_id) + REFERENCES unipept.datasets (id) + ON DELETE NO ACTION + ON UPDATE NO ACTION; +ALTER TABLE unipept.go_cross_references + ADD PRIMARY KEY (id); +ALTER TABLE unipept.ec_cross_references + ADD PRIMARY KEY (id); +ALTER TABLE unipept.interpro_cross_references + ADD PRIMARY KEY (id); + diff --git a/schemas/structure_no_index.sql b/schemas/structure_no_index.sql index 9bac4112..bd85fd54 100644 --- a/schemas/structure_no_index.sql +++ b/schemas/structure_no_index.sql @@ -1,228 +1,228 @@ -SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0; -SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0; -SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL'; +SET session_replication_role = "replica"; -- Drop the old database. This database will be recreated further on during this script! 
-DROP DATABASE IF EXISTS `unipept`; +DROP SCHEMA IF EXISTS "unipept" CASCADE; + +CREATE SCHEMA IF NOT EXISTS "unipept"; + + +-- ----------------------------------------------------- +-- Enums +-- ----------------------------------------------------- +DO +$$ + BEGIN + CREATE TYPE RANK_TYPE AS ENUM ('no rank', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'superorder', 'order', 'suborder', 'infraorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species group', 'species subgroup', 'species', 'subspecies', 'strain', 'varietas', 'forma'); + EXCEPTION + WHEN duplicate_object THEN null; + END +$$; + +DO +$$ + BEGIN + CREATE TYPE DB_TYPE AS ENUM ('swissprot', 'trembl'); + EXCEPTION + WHEN duplicate_object THEN null; + END +$$; + +DO +$$ + BEGIN + CREATE TYPE GO_NAMESPACE AS ENUM ('biological process', 'molecular function', 'cellular component'); + EXCEPTION + WHEN duplicate_object THEN null; + END +$$; -CREATE SCHEMA IF NOT EXISTS `unipept` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ; -USE `unipept` ; -- ----------------------------------------------------- -- Table `unipept`.`taxons` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`taxons` ( - `id` MEDIUMINT UNSIGNED NOT NULL , - `name` VARCHAR(120) NOT NULL , - `rank` ENUM('no rank', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'superorder', 'order', 'suborder', 'infraorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species group', 'species subgroup', 'species', 'subspecies', 'strain', 'varietas', 'forma' ) NULL DEFAULT NULL , - `parent_id` MEDIUMINT UNSIGNED NULL DEFAULT NULL , - `valid_taxon` BIT NOT NULL DEFAULT 1 , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; +CREATE TABLE IF NOT EXISTS 
"unipept"."taxons" +( + "id" INT NOT NULL PRIMARY KEY, + "name" VARCHAR(120) NOT NULL, + "rank" RANK_TYPE NULL DEFAULT NULL, + "parent_id" INT NULL DEFAULT NULL, + "valid_taxon" SMALLINT NOT NULL DEFAULT 1 +); -- ----------------------------------------------------- -- Table `unipept`.`uniprot_entries` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`uniprot_entries` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `uniprot_accession_number` CHAR(10) ASCII NOT NULL , - `version` SMALLINT UNSIGNED NOT NULL , - `taxon_id` MEDIUMINT UNSIGNED NOT NULL , - `type` ENUM('swissprot', 'trembl') NOT NULL , - `name`VARCHAR(150) NOT NULL , - `protein` TEXT NOT NULL , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."uniprot_entries" +( + "id" INT NOT NULL PRIMARY KEY, + "uniprot_accession_number" CHAR(10) NOT NULL, + "version" INT NOT NULL, + "taxon_id" INT NOT NULL, + "type" DB_TYPE NOT NULL, + "name" VARCHAR(150) NOT NULL, + "protein" TEXT NOT NULL +); -- ----------------------------------------------------- -- Table `unipept`.`ec_numbers` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`ec_numbers` ( - `id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, - `code` VARCHAR(15) NOT NULL, - `name` VARCHAR(155) NOT NULL, - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."ec_numbers" +( + "id" INT NOT NULL PRIMARY KEY, + "code" VARCHAR(15) NOT NULL, + "name" VARCHAR(155) NOT NULL +); -- ----------------------------------------------------- -- Table `unipept`.`go_terms` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`go_terms` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `code` VARCHAR(15) NOT NULL, - `namespace` ENUM('biological process', 'molecular function', 'cellular 
component') NOT NULL, - `name` VARCHAR(200) NOT NULL, - PRIMARY KEY (`id`)) -ENGINE = InnoDB; +CREATE TABLE IF NOT EXISTS "unipept"."go_terms" +( + "id" INT NOT NULL PRIMARY KEY, + "code" VARCHAR(15) NOT NULL, + "namespace" GO_NAMESPACE NOT NULL, + "name" VARCHAR(200) NOT NULL +); -- ----------------------------------------------------- --- Table `unipept`.`interpro` +-- Table `unipept`.`interpro_entries` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`interpro_entries` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `code` VARCHAR(9) NOT NULL, - `category` VARCHAR(32) NOT NULL, - `name` VARCHAR(160) NOT NULL, - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."interpro_entries" +( + "id" INT NOT NULL PRIMARY KEY, + "code" VARCHAR(9) NOT NULL, + "category" VARCHAR(32) NOT NULL, + "name" VARCHAR(160) NOT NULL +); -- ----------------------------------------------------- -- Table `unipept`.`lineages` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`lineages` ( - `taxon_id` MEDIUMINT UNSIGNED NOT NULL , - `superkingdom` MEDIUMINT NULL DEFAULT NULL , - `kingdom` MEDIUMINT NULL DEFAULT NULL , - `subkingdom` MEDIUMINT NULL DEFAULT NULL , - `superphylum` MEDIUMINT NULL DEFAULT NULL , - `phylum` MEDIUMINT NULL DEFAULT NULL , - `subphylum` MEDIUMINT NULL DEFAULT NULL , - `superclass` MEDIUMINT NULL DEFAULT NULL , - `class` MEDIUMINT NULL DEFAULT NULL , - `subclass` MEDIUMINT NULL DEFAULT NULL , - `superorder` MEDIUMINT NULL DEFAULT NULL , - `order` MEDIUMINT NULL DEFAULT NULL , - `suborder` MEDIUMINT NULL DEFAULT NULL , - `infraorder` MEDIUMINT NULL DEFAULT NULL , - `superfamily` MEDIUMINT NULL DEFAULT NULL , - `family` MEDIUMINT NULL DEFAULT NULL , - `subfamily` MEDIUMINT NULL DEFAULT NULL , - `tribe` MEDIUMINT NULL DEFAULT NULL , - `subtribe` MEDIUMINT NULL DEFAULT NULL , - `genus` MEDIUMINT 
NULL DEFAULT NULL , - `subgenus` MEDIUMINT NULL DEFAULT NULL , - `species_group` MEDIUMINT NULL DEFAULT NULL , - `species_subgroup` MEDIUMINT NULL DEFAULT NULL , - `species` MEDIUMINT NULL DEFAULT NULL , - `subspecies` MEDIUMINT NULL DEFAULT NULL , - `strain` MEDIUMINT NULL DEFAULT NULL , - `varietas` MEDIUMINT NULL DEFAULT NULL , - `forma` MEDIUMINT NULL DEFAULT NULL , - PRIMARY KEY (`taxon_id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`sequences` --- BLOCKSIZE 16 IS DEFAULT BLOCKSIZE --- Only BLOBS will get compressed --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`sequences` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `sequence` VARCHAR(50) NOT NULL , - `lca` MEDIUMINT UNSIGNED NULL , - `lca_il` MEDIUMINT UNSIGNED NULL , - `fa` MEDIUMBLOB NULL , - `fa_il` MEDIUMBLOB NULL , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=16 -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`peptides` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`peptides` ( - `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT , - `sequence_id` INT UNSIGNED NOT NULL , - `original_sequence_id` INT UNSIGNED NOT NULL , - `uniprot_entry_id` INT UNSIGNED NOT NULL , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`datasets` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`datasets` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `environment` VARCHAR(160) NULL , - `reference` VARCHAR(500) NULL , - `url` VARCHAR(200) NULL , - `project_website` VARCHAR(200) NULL , - PRIMARY KEY (`id`) ) 
-ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; - - --- ----------------------------------------------------- --- Table `unipept`.`dataset_items` --- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`dataset_items` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `dataset_id` INT UNSIGNED NULL , - `name` VARCHAR(160) NULL , - `data` MEDIUMTEXT CHARACTER SET 'ascii' COLLATE 'ascii_general_ci' NOT NULL , - `order` INT NULL , - PRIMARY KEY (`id`) , - INDEX `fk_dataset_items_datasets` (`dataset_id` ASC) , - CONSTRAINT `fk_dataset_items_datasets` - FOREIGN KEY (`dataset_id` ) - REFERENCES `unipept`.`datasets` (`id` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION) -ENGINE = InnoDB -DEFAULT CHARACTER SET = utf8 -COLLATE = utf8_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."lineages" +( + "taxon_id" INT NOT NULL PRIMARY KEY, + "superkingdom" INT NULL DEFAULT NULL, + "kingdom" INT NULL DEFAULT NULL, + "subkingdom" INT NULL DEFAULT NULL, + "superphylum" INT NULL DEFAULT NULL, + "phylum" INT NULL DEFAULT NULL, + "subphylum" INT NULL DEFAULT NULL, + "superclass" INT NULL DEFAULT NULL, + "class" INT NULL DEFAULT NULL, + "subclass" INT NULL DEFAULT NULL, + "superorder" INT NULL DEFAULT NULL, + "order" INT NULL DEFAULT NULL, + "suborder" INT NULL DEFAULT NULL, + "infraorder" INT NULL DEFAULT NULL, + "superfamily" INT NULL DEFAULT NULL, + "family" INT NULL DEFAULT NULL, + "subfamily" INT NULL DEFAULT NULL, + "tribe" INT NULL DEFAULT NULL, + "subtribe" INT NULL DEFAULT NULL, + "genus" INT NULL DEFAULT NULL, + "subgenus" INT NULL DEFAULT NULL, + "species_group" INT NULL DEFAULT NULL, + "species_subgroup" INT NULL DEFAULT NULL, + "species" INT NULL DEFAULT NULL, + "subspecies" INT NULL DEFAULT NULL, + "strain" INT NULL DEFAULT NULL, + "varietas" INT NULL DEFAULT NULL, + "forma" INT NULL DEFAULT NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."sequences" +CREATE TABLE IF 
NOT EXISTS "unipept"."sequences" +( + "id" BIGINT NOT NULL PRIMARY KEY, + "sequence" VARCHAR(50) NOT NULL, + "lca" INT NULL, + "lca_il" INT NULL, + "fa" BYTEA NULL, + "fa_il" BYTEA NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."peptides" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."peptides" +( + "id" BIGINT NOT NULL PRIMARY KEY, + "sequence_id" BIGINT NOT NULL, + "original_sequence_id" BIGINT NOT NULL, + "uniprot_entry_id" BIGINT NOT NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."datasets" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."datasets" +( + "id" INT NOT NULL PRIMARY KEY, + "environment" VARCHAR(160) NULL, + "reference" VARCHAR(500) NULL, + "url" VARCHAR(200) NULL, + "project_website" VARCHAR(200) NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."dataset_items" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."dataset_items" +( + "id" INT NOT NULL PRIMARY KEY, + "dataset_id" BIGINT NULL, + "name" VARCHAR(160) NULL, + "data" TEXT NOT NULL, + "order" INT NULL, + CONSTRAINT "fk_dataset_items_datasets" + FOREIGN KEY ("dataset_id") + REFERENCES "unipept"."datasets" ("id") + ON DELETE NO ACTION + ON UPDATE NO ACTION +); + -- ----------------------------------------------------- -- Table `unipept`.`go_cross_references` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`go_cross_references` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `uniprot_entry_id` INT UNSIGNED NOT NULL , - `go_term_code` VARCHAR(15) NOT NULL , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."go_cross_references" +( + "id" BIGINT NOT NULL PRIMARY KEY, + "uniprot_entry_id" BIGINT 
NOT NULL, + "go_term_code" VARCHAR(15) NOT NULL +); -- ----------------------------------------------------- --- Table `unipept`.`ec_cross_references` +-- Table "unipept"."ec_cross_references" -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`ec_cross_references` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `uniprot_entry_id` INT UNSIGNED NOT NULL , - `ec_number_code` VARCHAR(15) NOT NULL , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."ec_cross_references" +( + "id" BIGINT NOT NULL PRIMARY KEY, + "uniprot_entry_id" BIGINT NOT NULL, + "ec_number_code" VARCHAR(15) NOT NULL +); + -- ----------------------------------------------------- --- Table `unipept`.`interpro_cross_references` +-- Table "unipept"."interpro_cross_references" -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `unipept`.`interpro_cross_references` ( - `id` INT UNSIGNED NOT NULL AUTO_INCREMENT , - `uniprot_entry_id` INT UNSIGNED NOT NULL , - `interpro_entry_code` VARCHAR(9) NOT NULL , - PRIMARY KEY (`id`)) -ENGINE = InnoDB -DEFAULT CHARACTER SET = ascii -COLLATE = ascii_general_ci; +CREATE TABLE IF NOT EXISTS "unipept"."interpro_cross_references" +( + "id" BIGINT NOT NULL PRIMARY KEY, + "uniprot_entry_id" BIGINT NOT NULL, + "interpro_entry_code" VARCHAR(9) NOT NULL +); -SET SQL_MODE=@OLD_SQL_MODE; -SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; -SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; +SET session_replication_role = "origin"; diff --git a/schemas/structure_no_index_no_constraints.sql b/schemas/structure_no_index_no_constraints.sql new file mode 100644 index 00000000..9b448229 --- /dev/null +++ b/schemas/structure_no_index_no_constraints.sql @@ -0,0 +1,223 @@ +SET session_replication_role = "replica"; + +-- Drop the old schema. It will be recreated further on in this script! 
+DROP SCHEMA IF EXISTS "unipept" CASCADE; + +CREATE SCHEMA IF NOT EXISTS "unipept"; + + +-- ----------------------------------------------------- +-- Enums +-- ----------------------------------------------------- +DO +$$ + BEGIN + CREATE TYPE RANK_TYPE AS ENUM ('no rank', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'superorder', 'order', 'suborder', 'infraorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species group', 'species subgroup', 'species', 'subspecies', 'strain', 'varietas', 'forma'); + EXCEPTION + WHEN duplicate_object THEN null; + END +$$; + +DO +$$ + BEGIN + CREATE TYPE DB_TYPE AS ENUM ('swissprot', 'trembl'); + EXCEPTION + WHEN duplicate_object THEN null; + END +$$; + +DO +$$ + BEGIN + CREATE TYPE GO_NAMESPACE AS ENUM ('biological process', 'molecular function', 'cellular component'); + EXCEPTION + WHEN duplicate_object THEN null; + END +$$; + + +-- ----------------------------------------------------- +-- Table `unipept`.`taxons` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."taxons" +( + "id" INT NOT NULL, + "name" VARCHAR(120) NOT NULL, + "rank" RANK_TYPE NULL DEFAULT NULL, + "parent_id" INT NULL DEFAULT NULL, + "valid_taxon" SMALLINT NOT NULL DEFAULT 1 +); + + +-- ----------------------------------------------------- +-- Table `unipept`.`uniprot_entries` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."uniprot_entries" +( + "id" INT NOT NULL, + "uniprot_accession_number" CHAR(10) NOT NULL, + "version" INT NOT NULL, + "taxon_id" INT NOT NULL, + "type" DB_TYPE NOT NULL, + "name" VARCHAR(150) NOT NULL, + "protein" TEXT NOT NULL +); + + +-- ----------------------------------------------------- +-- Table `unipept`.`ec_numbers` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."ec_numbers" +( + "id" INT 
NOT NULL, + "code" VARCHAR(15) NOT NULL, + "name" VARCHAR(155) NOT NULL +); + + +-- ----------------------------------------------------- +-- Table `unipept`.`go_terms` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."go_terms" +( + "id" INT NOT NULL, + "code" VARCHAR(15) NOT NULL, + "namespace" GO_NAMESPACE NOT NULL, + "name" VARCHAR(200) NOT NULL +); + + +-- ----------------------------------------------------- +-- Table `unipept`.`interpro_entries` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."interpro_entries" +( + "id" INT NOT NULL, + "code" VARCHAR(9) NOT NULL, + "category" VARCHAR(32) NOT NULL, + "name" VARCHAR(160) NOT NULL +); + + +-- ----------------------------------------------------- +-- Table `unipept`.`lineages` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."lineages" +( + "taxon_id" INT NOT NULL, + "superkingdom" INT NULL DEFAULT NULL, + "kingdom" INT NULL DEFAULT NULL, + "subkingdom" INT NULL DEFAULT NULL, + "superphylum" INT NULL DEFAULT NULL, + "phylum" INT NULL DEFAULT NULL, + "subphylum" INT NULL DEFAULT NULL, + "superclass" INT NULL DEFAULT NULL, + "class" INT NULL DEFAULT NULL, + "subclass" INT NULL DEFAULT NULL, + "superorder" INT NULL DEFAULT NULL, + "order" INT NULL DEFAULT NULL, + "suborder" INT NULL DEFAULT NULL, + "infraorder" INT NULL DEFAULT NULL, + "superfamily" INT NULL DEFAULT NULL, + "family" INT NULL DEFAULT NULL, + "subfamily" INT NULL DEFAULT NULL, + "tribe" INT NULL DEFAULT NULL, + "subtribe" INT NULL DEFAULT NULL, + "genus" INT NULL DEFAULT NULL, + "subgenus" INT NULL DEFAULT NULL, + "species_group" INT NULL DEFAULT NULL, + "species_subgroup" INT NULL DEFAULT NULL, + "species" INT NULL DEFAULT NULL, + "subspecies" INT NULL DEFAULT NULL, + "strain" INT NULL DEFAULT NULL, + "varietas" INT NULL DEFAULT NULL, + "forma" INT NULL DEFAULT NULL +); + + +-- 
----------------------------------------------------- +-- Table "unipept"."sequences" +CREATE TABLE IF NOT EXISTS "unipept"."sequences" +( + "id" BIGINT NOT NULL, + "sequence" VARCHAR(50) NOT NULL, + "lca" INT NULL, + "lca_il" INT NULL, + "fa" BYTEA NULL, + "fa_il" BYTEA NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."peptides" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."peptides" +( + "id" BIGINT NOT NULL, + "sequence_id" BIGINT NOT NULL, + "original_sequence_id" BIGINT NOT NULL, + "uniprot_entry_id" BIGINT NOT NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."datasets" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."datasets" +( + "id" INT NOT NULL, + "environment" VARCHAR(160) NULL, + "reference" VARCHAR(500) NULL, + "url" VARCHAR(200) NULL, + "project_website" VARCHAR(200) NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."dataset_items" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."dataset_items" +( + "id" INT NOT NULL, + "dataset_id" BIGINT NULL, + "name" VARCHAR(160) NULL, + "data" TEXT NOT NULL, + "order" INT NULL +); + + +-- ----------------------------------------------------- +-- Table `unipept`.`go_cross_references` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."go_cross_references" +( + "id" BIGINT NOT NULL, + "uniprot_entry_id" BIGINT NOT NULL, + "go_term_code" VARCHAR(15) NOT NULL +); + + +-- ----------------------------------------------------- +-- Table "unipept"."ec_cross_references" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."ec_cross_references" +( + "id" BIGINT NOT NULL, + "uniprot_entry_id" BIGINT NOT NULL, + "ec_number_code" VARCHAR(15) NOT NULL +); + + +-- 
----------------------------------------------------- +-- Table "unipept"."interpro_cross_references" +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS "unipept"."interpro_cross_references" +( + "id" BIGINT NOT NULL, + "uniprot_entry_id" BIGINT NOT NULL, + "interpro_entry_code" VARCHAR(9) NOT NULL +); + + +SET session_replication_role = "origin"; diff --git a/schemas/user_data.sql b/schemas/user_data.sql deleted file mode 100644 index df9c4128..00000000 --- a/schemas/user_data.sql +++ /dev/null @@ -1,10 +0,0 @@ --- ----------------------------------------------------- --- Data for table `unipept`.`users` --- ----------------------------------------------------- -START TRANSACTION; -USE `unipept`; -INSERT INTO `unipept`.`users` (`id`, `username`, `admin`) VALUES (1, 'bmesuere', 1); -INSERT INTO `unipept`.`users` (`id`, `username`, `admin`) VALUES (2, 'pdawyndt', 1); -INSERT INTO `unipept`.`users` (`id`, `username`, `admin`) VALUES (3, 'guest', 0); - -COMMIT; diff --git a/scripts/build_database.sh b/scripts/build_database.sh index 695d7e42..9a639ed5 100755 --- a/scripts/build_database.sh +++ b/scripts/build_database.sh @@ -696,7 +696,9 @@ fetch_ec_numbers() { /^DE/ { gsub(/.$/, "", $2) name = name $2 } END { print id, name }' - } | cat -n | sed 's/^ *//' | $CMD_LZ4 - > "$OUTPUT_DIR/ec_numbers.tsv.lz4" + } | cat -n \ + | awk -F'\t' 'BEGIN {OFS=FS} { if (length($3) > 155) $3 = substr($3, 1, 155); print }' - \ + | sed 's/^ *//' | $CMD_LZ4 - > "$OUTPUT_DIR/ec_numbers.tsv.lz4" log "Finished creating EC numbers." 
} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs index e197f228..b4dd12e8 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs @@ -138,17 +138,22 @@ impl TaxonList { Ok(taxon.valid) } - pub fn write_taxons(&self, pb: &PathBuf) -> Result<()> { + pub fn write_taxons(&mut self, pb: &PathBuf) -> Result<()> { let mut writer = open_write(pb).context("Unable to open taxon output file")?; - for (id, taxon) in self.entries.iter().enumerate() { + for (id, taxon) in self.entries.iter_mut().enumerate() { let taxon = if let Some(t) = taxon { t } else { continue; }; - let valid = if taxon.valid { '\u{0001}' } else { '\u{0000}' }; + let valid = if taxon.valid { '1' } else { '0' }; + + // Cap the name to the limit of the VARCHAR in the database (120 characters, not bytes) + // note that we can't do this earlier on because the name may be used + // in validate(); String::truncate takes a byte length and panics off a char boundary, so cut at the 120th char + if let Some((cut, _)) = taxon.name.char_indices().nth(120) { taxon.name.truncate(cut); } writeln!( &mut writer, diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs index 6f3298d6..d626004e 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs @@ -28,7 +28,7 @@ impl Entry { type_: String, accession_number: String, sequence: String, - name: String, + mut name: String, version: String, taxon_id: String, ec_references: Vec, @@ -39,6 +39,9 @@ impl Entry { .parse() .with_context(|| format!("Failed to parse {} to i32", taxon_id))?; + // Cap the name to the limit of the VARCHAR in the database (150 characters; cut on a char boundary so truncate cannot panic) + if let Some((cut, _)) = name.char_indices().nth(150) { name.truncate(cut); } + Ok(Entry { min_length, max_length, diff --git 
a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs index d92b9a15..de6a6c41 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs @@ -154,12 +154,16 @@ impl TableWriter { { self.uniprot_count += 1; - let accession_number = &entry.accession_number; - let version = entry.version.clone(); + let accession_number = if entry.accession_number.len() <= 10 { + &entry.accession_number + } else { + &entry.accession_number[..10] // NOTE(review): byte-index slice, panics if byte 10 is not a char boundary; assumes ASCII accession numbers - confirm + }; + let version = &entry.version; let taxon_id = entry.taxon_id; - let type_ = entry.type_.clone(); - let name = entry.name.clone(); - let sequence = entry.sequence.clone(); + let type_ = &entry.type_; + let name = &entry.name; + let sequence = &entry.sequence; writeln!( &mut self.uniprot_entries, diff --git a/scripts/parallel_index.sh b/scripts/parallel_index.sh index 06397e45..a8c01789 100755 --- a/scripts/parallel_index.sh +++ b/scripts/parallel_index.sh @@ -1,9 +1,8 @@ #!/bin/bash -# Define MySQL connection parameters +# Define PostgreSQL (psql) connection parameters; DB_NAME is used below as the qualifier in front of the table name DB_USER="unipept" DB_PASSWORD="unipept" -DB_HOST="localhost" DB_NAME="unipept" # Function to add an index in the background @@ -12,7 +11,7 @@ add_index() { local column_name=$2 # Execute the "add index" statement - mariadb -u "$DB_USER" -p"$DB_PASSWORD" -h "$DB_HOST" -e "ALTER TABLE $DB_NAME.$table_name ADD INDEX idx_$column_name ($column_name);" & + PGPASSWORD="$DB_PASSWORD" psql -U "$DB_USER" -c "CREATE INDEX idx_${table_name}_$column_name ON $DB_NAME.$table_name($column_name);" & } # List of tables and columns for which you want to add indexes diff --git a/scripts/parallel_load.sh b/scripts/parallel_load.sh index fcbfc571..e87e00f8 100755 --- a/scripts/parallel_load.sh +++ b/scripts/parallel_load.sh @@ -1,8 +1,9 @@ shopt -s 
expand_aliases -export db=unipept -export user=root -export pass=unipept +# Define PSQL connection parameters +export DB_USER="unipept" +export DB_PASSWORD="unipept" +export DB_NAME="unipept" dir="$1" @@ -10,7 +11,16 @@ function load_table() { file=$1 tbl=`echo $file | sed "s/.tsv.lz4//"` echo "lz4catting - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl" - lz4 -dc $file | mysql --local-infile=1 -u$user -p$pass $db -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1 + + # Remove the last two columns of the peptides file before streaming it into COPY (presumably the target table does not take them - verify); variables are quoted to avoid word splitting + { + if [ "$tbl" == "peptides" ] + then + lz4cat "$file" | awk 'BEGIN {FS = OFS = "\t"} {NF-=2; print}' - + else + lz4cat "$file" + fi + } | PGPASSWORD="$DB_PASSWORD" psql -U "$DB_USER" -c "COPY $DB_NAME.$tbl FROM STDIN WITH (FORMAT TEXT, DELIMITER E'\t', HEADER false, NULL '\N');" 2>&1 } export -f load_table