diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 0a5f647..056c211 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -168,7 +168,7 @@ impl BitArray { /// /// A `Result` indicating whether the write operation was successful or not. pub fn data_to_writer( - data: Vec, + data: &Vec, bits_per_value: usize, max_capacity: usize, writer: &mut impl Write @@ -314,7 +314,7 @@ mod tests { let data = vec![0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456]; let mut writer = Vec::new(); - data_to_writer(data, 40, 2, &mut writer).unwrap(); + data_to_writer(&data, 40, 2, &mut writer).unwrap(); assert_eq!( writer, @@ -341,7 +341,7 @@ mod tests { ]; let mut writer = Vec::new(); - data_to_writer(data, 32, 8, &mut writer).unwrap(); + data_to_writer(&data, 32, 8, &mut writer).unwrap(); assert_eq!( writer, @@ -380,7 +380,7 @@ mod tests { ]; let mut writer = Vec::new(); - data_to_writer(data, 32, 8, &mut writer).unwrap(); + data_to_writer(&data, 32, 8, &mut writer).unwrap(); assert_eq!( writer, diff --git a/sa-builder/build_suffix_array.pbs b/sa-builder/build-compressed.pbs similarity index 86% rename from sa-builder/build_suffix_array.pbs rename to sa-builder/build-compressed.pbs index fac37e6..2f80cc1 100644 --- a/sa-builder/build_suffix_array.pbs +++ b/sa-builder/build-compressed.pbs @@ -24,7 +24,7 @@ module swap cluster/gallade #PBS -o stdout.$PBS_JOBID #PBS -e stderr.$PBS_JOBID -prefix="$VSC_DATA_VO/bram/" +prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array" # load Rust module load Rust/1.75.0-GCCcore-12.3.0 @@ -38,4 +38,4 @@ cd $PBS_O_WORKDIR cargo build --release # execute -./target/release/suffixarray_builder -d "$prefix"uniprot_protein_database_minimal.tsv -t "$prefix"taxons.tsv --sparseness-factor 3 --construction-algorithm lib-div-suf-sort -o "$prefix"uniprot_suffix_array_sparse3.bin +./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3_compressed.bin -s 3 -a lib-div-suf-sort -c diff --git 
a/sa-builder/build.pbs b/sa-builder/build.pbs new file mode 100644 index 0000000..b1b8d2c --- /dev/null +++ b/sa-builder/build.pbs @@ -0,0 +1,41 @@ +#!/bin/bash + +######################################################################################################### +### This script is designed to run on the Ghent University HPC ### +### ### +### how to use: ### +### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade` ### +### 2) Navigate to the root of the project ### +### 3) Submit the job to the queue with `qsub sa-builder/build.pbs` ### +######################################################################################################### + +# define requested memory, cpu resources and email notifications +#PBS -m abe +#PBS -l walltime=10:00:00 +#PBS -l mem=750gb +# ask for 1 node, 1 cpu (not more needed since we don't have parallelism) +#PBS -l nodes=1:ppn=1 +#PBS -N suffix_array_construction_uniprot + +# define output and error files +#PBS -o stdout.$PBS_JOBID +#PBS -e stderr.$PBS_JOBID + +# go to cluster with high memory (kept below the #PBS directives: qsub ignores directives after the first command) +module swap cluster/gallade + +prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array" + +# load Rust +module load Rust/1.75.0-GCCcore-12.3.0 +module load Clang/16.0.6-GCCcore-12.3.0 # needed to build the bindings from Rust to C +module load CMake/3.26.3-GCCcore-12.3.0 + +# go to current working dir and execute +cd "$PBS_O_WORKDIR" + +# compile +cargo build --release + +# execute +./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3.bin -s 3 -a lib-div-suf-sort diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 812bd7d..ec608c3 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -3,7 +3,7 @@ use std::{ File, OpenOptions }, - io::Result + io, time::{SystemTime, SystemTimeError, UNIX_EPOCH} }; use clap::Parser; @@ -33,24 +33,27 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the taxon file..."); + 
let start_taxon_time = get_time_ms().unwrap(); let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); - eprintln!("✅ Successfully loaded the taxon file!"); + eprintln!("✅ Successfully loaded the taxon file in {} seconds!", (get_time_ms().unwrap() - start_taxon_time) / 1000.0); eprintln!("\tAggregation method: LCA*"); eprintln!(); eprintln!("📋 Started loading the proteins..."); + let start_proteins_time = get_time_ms().unwrap(); let mut data = Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); - eprintln!("✅ Successfully loaded the proteins!"); + eprintln!("✅ Successfully loaded the proteins in {} seconds!", (get_time_ms().unwrap() - start_proteins_time) / 1000.0); eprintln!(); eprintln!("📋 Started building the suffix array..."); + let start_ssa_time = get_time_ms().unwrap(); let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); - eprintln!("✅ Successfully built the suffix array!"); + eprintln!("✅ Successfully built the suffix array in {} seconds!", (get_time_ms().unwrap() - start_ssa_time) / 1000.0); eprintln!("\tAmount of items: {}", sa.len()); eprintln!("\tSample rate: {}", sparseness_factor); @@ -60,29 +63,30 @@ fn main() { eprintln!(); eprintln!("📋 Started dumping the suffix array..."); + let start_dump_time = get_time_ms().unwrap(); if compress_sa { let bits_per_value = (data.len() as f64).log2().ceil() as usize; if let Err(err) = - dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) + dump_compressed_suffix_array(&sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; - eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("✅ Successfully dumped the 
suffix array in {} seconds!", (get_time_ms().unwrap() - start_dump_time) / 1000.0); eprintln!("\tAmount of bits per item: {}", bits_per_value); } else { if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { eprint_and_exit(err.to_string().as_str()); } - eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("✅ Successfully dumped the suffix array in {} seconds!", (get_time_ms().unwrap() - start_dump_time) / 1000.0); eprintln!("\tAmount of bits per item: 64"); } } -fn open_file(file: &str) -> Result<File> { +fn open_file(file: &str) -> io::Result<File> { OpenOptions::new() .create(true) .write(true) @@ -94,3 +98,11 @@ fn eprint_and_exit(err: &str) -> ! { eprintln!("{}", err); std::process::exit(1); } + +/// Returns the current wall-clock time in milliseconds since the Unix epoch. +/// +/// Fails with a `SystemTimeError` when the system clock reads earlier than the epoch. +/// NOTE(review): `SystemTime` is not monotonic; `std::time::Instant` is the usual choice for timing. +pub fn get_time_ms() -> Result<f64, SystemTimeError> { + Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6) +} diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 85a41df..24fce68 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -26,7 +26,7 @@ use sa_index::SuffixArray; /// /// Returns an error if writing to the writer fails. 
pub fn dump_compressed_suffix_array( - sa: Vec, + sa: &Vec, sparseness_factor: u8, bits_per_value: usize, writer: &mut impl Write @@ -146,7 +146,7 @@ mod tests { let sa = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; let mut writer = vec![]; - dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); + dump_compressed_suffix_array(&sa, 1, 8, &mut writer).unwrap(); assert_eq!( writer, @@ -167,7 +167,7 @@ mod tests { valid_write_count: 0 }; - dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + dump_compressed_suffix_array(&vec![], 1, 8, &mut writer).unwrap(); } #[test] @@ -177,7 +177,7 @@ mod tests { valid_write_count: 1 }; - dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + dump_compressed_suffix_array(&vec![], 1, 8, &mut writer).unwrap(); } #[test] @@ -187,7 +187,7 @@ mod tests { valid_write_count: 2 }; - dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + dump_compressed_suffix_array(&vec![], 1, 8, &mut writer).unwrap(); } #[test] @@ -197,7 +197,7 @@ mod tests { valid_write_count: 3 }; - dump_compressed_suffix_array(vec![1], 1, 8, &mut writer).unwrap(); + dump_compressed_suffix_array(&vec![1], 1, 8, &mut writer).unwrap(); } #[test]