diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs
index 156cff9..78851a1 100644
--- a/libsais64-rs/builder.rs
+++ b/libsais64-rs/builder.rs
@@ -5,7 +5,10 @@ use std::{
         Display,
         Formatter
     },
-    path::{Path, PathBuf},
+    path::{
+        Path,
+        PathBuf
+    },
     process::{
         Command,
         ExitStatus
@@ -58,7 +61,10 @@ fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), Comp
 
 fn main() -> Result<(), Box<dyn Error>> {
     // remove the old libsais folder
-    Command::new("rm").args(["-rf", "libsais"]).status().unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it
+    Command::new("rm")
+        .args(["-rf", "libsais"])
+        .status()
+        .unwrap_or_default(); // if removing fails, it is because the folder did not exist, so we can safely ignore it
 
     // clone the c library
     Command::new("git")
diff --git a/sa-builder/build-compressed.pbs b/sa-builder/build-compressed.pbs
new file mode 100644
index 0000000..9a1adb3
--- /dev/null
+++ b/sa-builder/build-compressed.pbs
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+#########################################################################################################
+### This script is designed to run on the Ghent university HPC                                        ###
+###                                                                                                   ###
+### how to use:                                                                                       ###
+### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade`             ###
+### 2) Navigate to the root of the project                                                            ###
+### 3) Submit the job to the queue with `qsub sa-builder/build-compressed.pbs`                        ###
+#########################################################################################################
+
+# go to cluster with high memory
+module swap cluster/gallade
+
+# define requested memory, cpu resources and email notifications
+#PBS -m abe
+#PBS -l walltime=6:00:00
+#PBS -l mem=750gb
+# ask for 1 node and all cores on that node
+#PBS -l nodes=1:ppn=all
+#PBS -N suffix_array_construction_uniprot
+
+# define output and error files
+#PBS -o stdout.$PBS_JOBID
+#PBS -e stderr.$PBS_JOBID
+
+prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array"
+
+# load Rust
+module load Rust/1.75.0-GCCcore-12.3.0
+module load Clang/16.0.6-GCCcore-12.3.0 # needed to build the bindings from Rust to C
+module load CMake/3.26.3-GCCcore-12.3.0
+
+# go to current working dir and execute
+cd $PBS_O_WORKDIR
+
+# compile
+cargo build --release
+
+# execute
+./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3_compressed.bin -s 3 -a lib-div-suf-sort -c
diff --git a/sa-builder/build_suffix_array.pbs b/sa-builder/build.pbs
similarity index 79%
rename from sa-builder/build_suffix_array.pbs
rename to sa-builder/build.pbs
index fac37e6..1725281 100644
--- a/sa-builder/build_suffix_array.pbs
+++ b/sa-builder/build.pbs
@@ -6,7 +6,7 @@
 ### how to use:                                                                                       ###
 ### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade`             ###
 ### 2) Navigate the to root of the project                                                            ###
-### 3) Submit the job to the queue with `qsub suffixarray/build_suffix_array.pbs`                     ###
+### 3) Submit the job to the queue with `qsub sa-builder/build.pbs`                                   ###
 #########################################################################################################
 
 # go to cluster with high memory
@@ -17,14 +17,14 @@ module swap cluster/gallade
 #PBS -m abe
 #PBS -l walltime=10:00:00
 #PBS -l mem=750gb
 # ask for 1 node, 1 cpu (not more needed since we don't have parallelism)
-#PBS -l nodes=1:ppn=1
+#PBS -l nodes=1:ppn=all
 #PBS -N suffix_array_construction_uniprot
 # define output and error files
 #PBS -o stdout.$PBS_JOBID
 #PBS -e stderr.$PBS_JOBID
 
-prefix="$VSC_DATA_VO/bram/"
+prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array"
 
 # load Rust
 module load Rust/1.75.0-GCCcore-12.3.0
@@ -38,4 +38,4 @@ cd $PBS_O_WORKDIR
 cargo build --release
 
 # execute
-./target/release/suffixarray_builder -d "$prefix"uniprot_protein_database_minimal.tsv -t "$prefix"taxons.tsv --sparseness-factor 3 --construction-algorithm lib-div-suf-sort -o "$prefix"uniprot_suffix_array_sparse3.bin
+./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3.bin -s 3 -a lib-div-suf-sort
diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs
index 812bd7d..f458c47 100644
--- a/sa-builder/src/main.rs
+++ b/sa-builder/src/main.rs
@@ -3,7 +3,12 @@ use std::{
         File,
         OpenOptions
     },
-    io::Result
+    io::BufWriter,
+    time::{
+        SystemTime,
+        SystemTimeError,
+        UNIX_EPOCH
+    }
 };
 
 use clap::Parser;
@@ -33,33 +38,46 @@ fn main() {
     eprintln!();
     eprintln!("📋 Started loading the taxon file...");
+    let start_taxon_time = get_time_ms().unwrap();
     let taxon_id_calculator =
         TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)
             .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
-    eprintln!("✅ Successfully loaded the taxon file!");
+    eprintln!(
+        "✅ Successfully loaded the taxon file in {} seconds!",
+        (get_time_ms().unwrap() - start_taxon_time) / 1000.0
+    );
     eprintln!("\tAggregation method: LCA*");
     eprintln!();
 
     eprintln!("📋 Started loading the proteins...");
+    let start_proteins_time = get_time_ms().unwrap();
     let mut data =
         Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator)
             .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
-    eprintln!("✅ Successfully loaded the proteins!");
+    eprintln!(
+        "✅ Successfully loaded the proteins in {} seconds!",
+        (get_time_ms().unwrap() - start_proteins_time) / 1000.0
+    );
     eprintln!();
 
     eprintln!("📋 Started building the suffix array...");
+    let start_ssa_time = get_time_ms().unwrap();
     let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor)
         .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
-    eprintln!("✅ Successfully built the suffix array!");
+    eprintln!(
+        "✅ Successfully built the suffix array in {} seconds!",
+        (get_time_ms().unwrap() - start_ssa_time) / 1000.0
+    );
     eprintln!("\tAmount of items: {}", sa.len());
     eprintln!("\tSample rate: {}", sparseness_factor);
 
     // open the output file
-    let mut file =
-        open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
+    let mut file = open_file_buffer(&output, 100 * 1024 * 1024)
+        .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
 
     eprintln!();
     eprintln!("📋 Started dumping the suffix array...");
+    let start_dump_time = get_time_ms().unwrap();
 
     if compress_sa {
         let bits_per_value = (data.len() as f64).log2().ceil() as usize;
@@ -70,27 +88,39 @@ fn main() {
             eprint_and_exit(err.to_string().as_str());
         };
 
-        eprintln!("✅ Successfully dumped the suffix array!");
+        eprintln!(
+            "✅ Successfully dumped the suffix array in {} seconds!",
+            (get_time_ms().unwrap() - start_dump_time) / 1000.0
+        );
         eprintln!("\tAmount of bits per item: {}", bits_per_value);
     } else {
         if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) {
             eprint_and_exit(err.to_string().as_str());
         }
 
-        eprintln!("✅ Successfully dumped the suffix array!");
+        eprintln!(
+            "✅ Successfully dumped the suffix array in {} seconds!",
+            (get_time_ms().unwrap() - start_dump_time) / 1000.0
+        );
eprintln!("\tAmount of bits per item: 64"); } } -fn open_file(file: &str) -> Result { - OpenOptions::new() +fn open_file_buffer(file: &str, buffer_size: usize) -> std::io::Result> { + let file = OpenOptions::new() .create(true) .write(true) .truncate(true) // if the file already exists, empty the file - .open(file) + .open(file)?; + + Ok(BufWriter::with_capacity(buffer_size, file)) } fn eprint_and_exit(err: &str) -> ! { eprintln!("{}", err); std::process::exit(1); } + +pub fn get_time_ms() -> Result { + Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6) +}