Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write optimization #23

Merged
merged 5 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions libsais64-rs/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ use std::{
Display,
Formatter
},
path::{Path, PathBuf},
path::{
Path,
PathBuf
},
process::{
Command,
ExitStatus
Expand Down Expand Up @@ -58,7 +61,10 @@ fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), Comp

fn main() -> Result<(), Box<dyn Error>> {
// remove the old libsais folder
Command::new("rm").args(["-rf", "libsais"]).status().unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it
Command::new("rm")
.args(["-rf", "libsais"])
.status()
.unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it

// clone the c library
Command::new("git")
Expand Down
41 changes: 41 additions & 0 deletions sa-builder/build-compressed.pbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

#########################################################################################################
### This script is designed to run on the Ghent university HPC                                         ###
###                                                                                                   ###
### how to use:                                                                                       ###
### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade`             ###
### 2) Navigate to the root of the project                                                            ###
### 3) Submit the job to the queue with `qsub sa-builder/build-compressed.pbs`                        ###
#########################################################################################################

# define requested memory, cpu resources and email notifications
# NOTE: #PBS directives are only honoured while they appear BEFORE the first
# executable command in the script, so they must stay above `module swap` below.
#PBS -m abe
#PBS -l walltime=6:00:00
#PBS -l mem=750gb
# ask for 1 node and all of its cores (a full node is requested so the job can claim its full memory)
#PBS -l nodes=1:ppn=all
#PBS -N suffix_array_construction_uniprot

# define output and error files
#PBS -o stdout.$PBS_JOBID
#PBS -e stderr.$PBS_JOBID

# go to cluster with high memory
module swap cluster/gallade

prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array"

# load Rust
module load Rust/1.75.0-GCCcore-12.3.0
module load Clang/16.0.6-GCCcore-12.3.0 # needed to build the bindings from Rust to C
module load CMake/3.26.3-GCCcore-12.3.0

# go to current working dir and execute
cd $PBS_O_WORKDIR

# compile
cargo build --release

# execute
./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3_compressed.bin -s 3 -a lib-div-suf-sort -c
8 changes: 4 additions & 4 deletions sa-builder/build_suffix_array.pbs → sa-builder/build.pbs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
### how to use: ###
### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade` ###
### 2) Navigate to the root of the project                                                           ###
### 3) Submit the job to the queue with `qsub suffixarray/build_suffix_array.pbs` ###
### 3) Submit the job to the queue with `qsub sa-builder/build.pbs` ###
#########################################################################################################

# go to cluster with high memory
Expand All @@ -17,14 +17,14 @@ module swap cluster/gallade
#PBS -l walltime=10:00:00
#PBS -l mem=750gb
# ask for 1 node and all of its cores
#PBS -l nodes=1:ppn=1
#PBS -l nodes=1:ppn=all
#PBS -N suffix_array_construction_uniprot

# define output and error files
#PBS -o stdout.$PBS_JOBID
#PBS -e stderr.$PBS_JOBID

prefix="$VSC_DATA_VO/bram/"
prefix="/kyukon/data/gent/vo/000/gvo00038/suffix-array"

# load Rust
module load Rust/1.75.0-GCCcore-12.3.0
Expand All @@ -38,4 +38,4 @@ cd $PBS_O_WORKDIR
cargo build --release

# execute
./target/release/suffixarray_builder -d "$prefix"uniprot_protein_database_minimal.tsv -t "$prefix"taxons.tsv --sparseness-factor 3 --construction-algorithm lib-div-suf-sort -o "$prefix"uniprot_suffix_array_sparse3.bin
./target/release/sa-builder -d "$prefix"/proteins.tsv -t "$prefix"/taxons.tsv -o "$prefix"/sa_sparse3.bin -s 3 -a lib-div-suf-sort
52 changes: 41 additions & 11 deletions sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ use std::{
File,
OpenOptions
},
io::Result
io::BufWriter,
time::{
SystemTime,
SystemTimeError,
UNIX_EPOCH
}
};

use clap::Parser;
Expand Down Expand Up @@ -33,33 +38,46 @@ fn main() {

eprintln!();
eprintln!("📋 Started loading the taxon file...");
let start_taxon_time = get_time_ms().unwrap();
let taxon_id_calculator =
TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully loaded the taxon file!");
eprintln!(
"✅ Successfully loaded the taxon file in {} seconds!",
(get_time_ms().unwrap() - start_taxon_time) / 1000.0
);
eprintln!("\tAggregation method: LCA*");

eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data =
Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully loaded the proteins!");
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
(get_time_ms().unwrap() - start_proteins_time) / 1000.0
);

eprintln!();
eprintln!("📋 Started building the suffix array...");
let start_ssa_time = get_time_ms().unwrap();
let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully built the suffix array!");
eprintln!(
"✅ Successfully built the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_ssa_time) / 1000.0
);
eprintln!("\tAmount of items: {}", sa.len());
eprintln!("\tSample rate: {}", sparseness_factor);

// open the output file
let mut file =
open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
let mut file = open_file_buffer(&output, 100 * 1024 * 1024)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));

eprintln!();
eprintln!("📋 Started dumping the suffix array...");
let start_dump_time = get_time_ms().unwrap();

if compress_sa {
let bits_per_value = (data.len() as f64).log2().ceil() as usize;
Expand All @@ -70,27 +88,39 @@ fn main() {
eprint_and_exit(err.to_string().as_str());
};

eprintln!("✅ Successfully dumped the suffix array!");
eprintln!(
"✅ Successfully dumped the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_dump_time) / 1000.0
);
eprintln!("\tAmount of bits per item: {}", bits_per_value);
} else {
if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) {
eprint_and_exit(err.to_string().as_str());
}

eprintln!("✅ Successfully dumped the suffix array!");
eprintln!(
"✅ Successfully dumped the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_dump_time) / 1000.0
);
eprintln!("\tAmount of bits per item: 64");
}
}

fn open_file(file: &str) -> Result<File> {
OpenOptions::new()
/// Open `file` for writing (creating it if absent, truncating it if present)
/// and wrap it in a `BufWriter` with the requested `buffer_size` in bytes.
///
/// # Errors
/// Returns any I/O error raised while creating/opening the file.
fn open_file_buffer(file: &str, buffer_size: usize) -> std::io::Result<BufWriter<File>> {
    // `File::create` is create + write + truncate: an existing file is emptied first.
    let handle = File::create(file)?;
    Ok(BufWriter::with_capacity(buffer_size, handle))
}

/// Print `err` on stderr, then terminate the whole process with exit code 1.
/// Never returns (`!`), so callers can use it inside `unwrap_or_else`.
fn eprint_and_exit(err: &str) -> ! {
    eprintln!("{}", err);
    std::process::exit(1)
}

/// Current wall-clock time in milliseconds since the Unix epoch, as `f64`
/// (fractional part preserves sub-millisecond precision).
///
/// # Errors
/// Returns `SystemTimeError` if the system clock reads as before the epoch.
pub fn get_time_ms() -> Result<f64, SystemTimeError> {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH)?;
    // nanoseconds -> milliseconds
    Ok(since_epoch.as_nanos() as f64 * 1e-6)
}
Loading