Skip to content

Commit

Permalink
Write optimization (#23)
Browse files Browse the repository at this point in the history
* buffered writing

* build pbs scripts

* timings when building

* update requirements pbs

* fix formatting
  • Loading branch information
tibvdm authored Jun 26, 2024
1 parent a181fa9 commit 95f073f
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 17 deletions.
10 changes: 8 additions & 2 deletions libsais64-rs/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ use std::{
Display,
Formatter
},
path::{Path, PathBuf},
path::{
Path,
PathBuf
},
process::{
Command,
ExitStatus
Expand Down Expand Up @@ -58,7 +61,10 @@ fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), Comp

fn main() -> Result<(), Box<dyn Error>> {
// remove the old libsais folder
Command::new("rm").args(["-rf", "libsais"]).status().unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it
Command::new("rm")
.args(["-rf", "libsais"])
.status()
.unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it

// clone the c library
Command::new("git")
Expand Down
41 changes: 41 additions & 0 deletions sa-builder/build-compressed.pbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

#########################################################################################################
### This script is designed to run on the Ghent university HPC                                        ###
###                                                                                                   ###
### how to use:                                                                                       ###
###   1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade`           ###
###   2) Navigate to the root of the project                                                          ###
###   3) Submit the job to the queue with `qsub sa-builder/build-compressed.pbs`                      ###
#########################################################################################################

# define requested memory, cpu resources and email notifications
# NOTE: #PBS directives are only honoured if they appear before the first executable
# command in the script, so they must come before the `module swap` below.
#PBS -m abe
#PBS -l walltime=6:00:00
#PBS -l mem=750gb
# ask for all cpus on a single node
#PBS -l nodes=1:ppn=all
#PBS -N suffix_array_construction_uniprot

# define output and error files
#PBS -o stdout.$PBS_JOBID
#PBS -e stderr.$PBS_JOBID

# go to cluster with high memory
module swap cluster/gallade

prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array"

# load Rust
module load Rust/1.75.0-GCCcore-12.3.0
module load Clang/16.0.6-GCCcore-12.3.0 # needed to build the bindings from Rust to C
module load CMake/3.26.3-GCCcore-12.3.0

# go to current working dir and execute
cd $PBS_O_WORKDIR

# compile
cargo build --release

# execute
./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3_compressed.bin -s 3 -a lib-div-suf-sort -c
8 changes: 4 additions & 4 deletions sa-builder/build_suffix_array.pbs → sa-builder/build.pbs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
### how to use: ###
### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade` ###
### 2) Navigate to the root of the project ###
### 3) Submit the job to the queue with `qsub suffixarray/build_suffix_array.pbs` ###
### 3) Submit the job to the queue with `qsub sa-builder/build.pbs` ###
#########################################################################################################

# go to cluster with high memory
Expand All @@ -17,14 +17,14 @@ module swap cluster/gallade
#PBS -l walltime=10:00:00
#PBS -l mem=750gb
# ask for 1 node, 1 cpu (not more needed since we don't have parallelism)
#PBS -l nodes=1:ppn=1
#PBS -l nodes=1:ppn=all
#PBS -N suffix_array_construction_uniprot

# define output and error files
#PBS -o stdout.$PBS_JOBID
#PBS -e stderr.$PBS_JOBID

prefix="$VSC_DATA_VO/bram/"
prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array"

# load Rust
module load Rust/1.75.0-GCCcore-12.3.0
Expand All @@ -38,4 +38,4 @@ cd $PBS_O_WORKDIR
cargo build --release

# execute
./target/release/suffixarray_builder -d "$prefix"uniprot_protein_database_minimal.tsv -t "$prefix"taxons.tsv --sparseness-factor 3 --construction-algorithm lib-div-suf-sort -o "$prefix"uniprot_suffix_array_sparse3.bin
./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3.bin -s 3 -a lib-div-suf-sort
52 changes: 41 additions & 11 deletions sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ use std::{
File,
OpenOptions
},
io::Result
io::BufWriter,
time::{
SystemTime,
SystemTimeError,
UNIX_EPOCH
}
};

use clap::Parser;
Expand Down Expand Up @@ -33,33 +38,46 @@ fn main() {

eprintln!();
eprintln!("📋 Started loading the taxon file...");
let start_taxon_time = get_time_ms().unwrap();
let taxon_id_calculator =
TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully loaded the taxon file!");
eprintln!(
"✅ Successfully loaded the taxon file in {} seconds!",
(get_time_ms().unwrap() - start_taxon_time) / 1000.0
);
eprintln!("\tAggregation method: LCA*");

eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data =
Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully loaded the proteins!");
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
(get_time_ms().unwrap() - start_proteins_time) / 1000.0
);

eprintln!();
eprintln!("📋 Started building the suffix array...");
let start_ssa_time = get_time_ms().unwrap();
let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully built the suffix array!");
eprintln!(
"✅ Successfully built the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_ssa_time) / 1000.0
);
eprintln!("\tAmount of items: {}", sa.len());
eprintln!("\tSample rate: {}", sparseness_factor);

// open the output file
let mut file =
open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
let mut file = open_file_buffer(&output, 100 * 1024 * 1024)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));

eprintln!();
eprintln!("📋 Started dumping the suffix array...");
let start_dump_time = get_time_ms().unwrap();

if compress_sa {
let bits_per_value = (data.len() as f64).log2().ceil() as usize;
Expand All @@ -70,27 +88,39 @@ fn main() {
eprint_and_exit(err.to_string().as_str());
};

eprintln!("✅ Successfully dumped the suffix array!");
eprintln!(
"✅ Successfully dumped the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_dump_time) / 1000.0
);
eprintln!("\tAmount of bits per item: {}", bits_per_value);
} else {
if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) {
eprint_and_exit(err.to_string().as_str());
}

eprintln!("✅ Successfully dumped the suffix array!");
eprintln!(
"✅ Successfully dumped the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_dump_time) / 1000.0
);
eprintln!("\tAmount of bits per item: 64");
}
}

/// Opens (or creates) `file` for writing, truncating any existing contents,
/// and wraps it in a [`BufWriter`] with the given `buffer_size` in bytes.
///
/// Buffered writing avoids a syscall per small write, which matters when
/// dumping a large suffix array item by item.
///
/// # Errors
/// Returns any `std::io::Error` raised while opening the file.
fn open_file_buffer(file: &str, buffer_size: usize) -> std::io::Result<BufWriter<File>> {
    let file = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true) // if the file already exists, empty the file
        .open(file)?;

    Ok(BufWriter::with_capacity(buffer_size, file))
}

/// Prints the given error message to stderr, then terminates the whole
/// process with a non-zero exit code. Never returns.
fn eprint_and_exit(err: &str) -> ! {
    eprintln!("{err}");
    std::process::exit(1)
}

/// Returns the current wall-clock time as fractional milliseconds since the
/// Unix epoch (nanosecond resolution preserved in the fraction).
///
/// # Errors
/// Fails with a `SystemTimeError` if the system clock is set before the Unix epoch.
pub fn get_time_ms() -> Result<f64, SystemTimeError> {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH)?;
    let nanos = since_epoch.as_nanos() as f64;
    Ok(nanos * 1e-6)
}

0 comments on commit 95f073f

Please sign in to comment.