Skip to content

Commit

Permalink
wip on partitioning the HIBF
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Nov 9, 2023
1 parent 64a6eac commit b4dc08a
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 52 deletions.
10 changes: 10 additions & 0 deletions include/chopper/configuration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ struct configuration
uint8_t window_size{k};
//!\}

/*!\name Partitioned HIBF configuration
* \{
*/
//!\brief The maximum index size that the HIBF should not exceed. number_of_partitions will be set accordingly.
size_t maximum_index_size{0};

//!\brief The number of partitions for the HIBF index.
size_t number_of_partitions{0};
//!\}

/*!\name Configuration of size estimates
* \{
*/
Expand Down
195 changes: 143 additions & 52 deletions src/layout/execute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <filesystem>
#include <fstream>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <tuple>
Expand All @@ -27,8 +28,11 @@

#include <hibf/layout/compute_layout.hpp>
#include <hibf/layout/layout.hpp>
#include <hibf/misc/divide_and_ceil.hpp>
#include <hibf/sketch/toolbox.hpp>
#include <hibf/sketch/compute_sketches.hpp>
#include <hibf/sketch/hyperloglog.hpp>
#include <hibf/sketch/toolbox.hpp>

namespace chopper::layout
{
Expand Down Expand Up @@ -58,71 +62,158 @@ int execute(chopper::configuration & config, std::vector<std::string> const & fi
<< "anyway, so we increased your number of technical bins to " << config.hibf_config.tmax << ".\n";
}

seqan::hibf::layout::layout hibf_layout;
std::vector<seqan::hibf::sketch::hyperloglog> sketches;
if (config.number_of_partitions < 2) // 0 == unset == single HIBF, 1 == single HIBF
{
seqan::hibf::layout::layout hibf_layout;
std::vector<seqan::hibf::sketch::hyperloglog> sketches;

seqan::hibf::concurrent_timer compute_sketches_timer{};
seqan::hibf::concurrent_timer union_estimation_timer{};
seqan::hibf::concurrent_timer rearrangement_timer{};
seqan::hibf::concurrent_timer dp_algorithm_timer{};
seqan::hibf::concurrent_timer compute_sketches_timer{};
seqan::hibf::concurrent_timer union_estimation_timer{};
seqan::hibf::concurrent_timer rearrangement_timer{};
seqan::hibf::concurrent_timer dp_algorithm_timer{};

if (config.determine_best_tmax)
{
std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config);
if (config.determine_best_tmax)
{
std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config);
}
else
{
std::vector<size_t> kmer_counts;

compute_sketches_timer.start();
seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches);
compute_sketches_timer.stop();

std::vector<size_t> positions = [&kmer_counts]()
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}(); // GCOVR_EXCL_LINE

dp_algorithm_timer.start();
hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config,
kmer_counts,
sketches,
std::move(positions),
union_estimation_timer,
rearrangement_timer);
dp_algorithm_timer.stop();

if (config.output_verbose_statistics)
{
size_t dummy{};
chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts};
global_stats.hibf_layout = hibf_layout;
global_stats.print_header_to(std::cout);
global_stats.print_summary_to(dummy, std::cout);
}
}

if (!config.disable_sketch_output)
{
if (!std::filesystem::exists(config.sketch_directory))
std::filesystem::create_directory(config.sketch_directory);

assert(filenames.size() == sketches.size());
for (size_t i = 0; i < filenames.size(); ++i)
sketch::write_sketch_file(filenames[i], sketches[i], config);
}

// brief Write the output to the layout file.
std::ofstream fout{config.output_filename};
chopper::layout::write_user_bins_to(filenames, fout);
config.write_to(fout);
hibf_layout.write_to(fout);

if (!config.output_timings.empty())
{
std::ofstream output_stream{config.output_timings};
output_stream << std::fixed << std::setprecision(2);
output_stream << "sketching_in_seconds\t"
<< "layouting_in_seconds\t"
<< "union_estimation_in_seconds\t"
<< "rearrangement_in_seconds\n";
output_stream << compute_sketches_timer.in_seconds() << '\t';
output_stream << dp_algorithm_timer.in_seconds() << '\t';
output_stream << union_estimation_timer.in_seconds() << '\t';
output_stream << rearrangement_timer.in_seconds() << '\t';
}
}
else
{
std::vector<size_t> kmer_counts;
std::vector<size_t> cardinalities;
std::vector<seqan::hibf::sketch::hyperloglog> sketches;
std::vector<std::vector<size_t>> positions; // asign positions of user bins for each partition
std::vector<seqan::hibf::layout::layout> hibf_layouts(config.number_of_partitions); // multiple layouts
seqan::hibf::concurrent_timer compute_sketches_timer{};

// compute sketches of all user bins
compute_sketches_timer.start();
seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches);
seqan::hibf::sketch::compute_sketches(config.hibf_config, cardinalities, sketches);
compute_sketches_timer.stop();
dp_algorithm_timer.start();
hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config,
kmer_counts,
sketches,
union_estimation_timer,
rearrangement_timer);
dp_algorithm_timer.stop();

if (config.output_verbose_statistics)

// std::accumulate carries the sum in the type of its initial value, so seed it with size_t;
// an `unsigned int` seed would truncate the total before it is stored.
size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), size_t{0});
size_t const cardinality_per_part = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions);

size_t current_cardinality{0u};
size_t current_part{0u};

std::vector<size_t> const sorted_positions = [&cardinalities]()
{
std::vector<size_t> ps;
ps.resize(cardinalities.size());
std::iota(ps.begin(), ps.end(), 0);
seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps);
return ps;
}();

for (size_t const current_user_bin_id : sorted_positions)
{
size_t dummy{};
chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts};
global_stats.hibf_layout = hibf_layout;
global_stats.print_header_to(std::cout);
global_stats.print_summary_to(dummy, std::cout);
positions[current_part].push_back(current_user_bin_id);
current_cardinality += cardinalities[current_user_bin_id];

// Move on to the next partition once this one has reached its share, but never past the last
// partition: bins with cardinality 0 (or a per-partition share of 0) could otherwise advance
// current_part beyond config.number_of_partitions - 1 and index out of range on the next bin.
if (current_cardinality >= cardinality_per_part && current_part + 1 < config.number_of_partitions)
{
    current_cardinality = 0;
    ++current_part;
}
}
}

if (!config.disable_sketch_output)
{
if (!std::filesystem::exists(config.sketch_directory))
std::filesystem::create_directory(config.sketch_directory);

assert(filenames.size() == sketches.size());
for (size_t i = 0; i < filenames.size(); ++i)
sketch::write_sketch_file(filenames[i], sketches[i], config);
}
#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads)
for (size_t i = 0; i < config.number_of_partitions; ++i)
{
seqan::hibf::concurrent_timer union_estimation_timer{};
seqan::hibf::concurrent_timer rearrangement_timer{};
seqan::hibf::concurrent_timer dp_algorithm_timer{};

dp_algorithm_timer.start();
hibf_layouts[i] = seqan::hibf::layout::compute_layout(config.hibf_config, cardinalities, sketches, std::move(positions[i]), union_estimation_timer, rearrangement_timer);
dp_algorithm_timer.stop();

if (!config.output_timings.empty())
{
std::ofstream output_stream{config.output_timings};
output_stream << std::fixed << std::setprecision(2);
output_stream << "sketching_in_seconds\t"
<< "layouting_in_seconds\t"
<< "union_estimation_in_seconds\t"
<< "rearrangement_in_seconds\n";
output_stream << compute_sketches_timer.in_seconds() << '\t';
output_stream << dp_algorithm_timer.in_seconds() << '\t';
output_stream << union_estimation_timer.in_seconds() << '\t';
output_stream << rearrangement_timer.in_seconds() << '\t';
}
}

// brief Write the output to the layout file.
std::ofstream fout{config.output_filename};
chopper::layout::write_user_bins_to(filenames, fout);
config.write_to(fout);
hibf_layout.write_to(fout);
// brief Write the output to the layout file.
std::ofstream fout{config.output_filename};
chopper::layout::write_user_bins_to(filenames, fout);
config.write_to(fout);

if (!config.output_timings.empty())
{
std::ofstream output_stream{config.output_timings};
output_stream << std::fixed << std::setprecision(2);
output_stream << "sketching_in_seconds\t"
<< "layouting_in_seconds\t"
<< "union_estimation_in_seconds\t"
<< "rearrangement_in_seconds\n";
output_stream << compute_sketches_timer.in_seconds() << '\t';
output_stream << dp_algorithm_timer.in_seconds() << '\t';
output_stream << union_estimation_timer.in_seconds() << '\t';
output_stream << rearrangement_timer.in_seconds() << '\t';
for (size_t i = 0; i < config.number_of_partitions; ++i)
hibf_layouts[i].write_to(fout);
}

return 0;
Expand Down
21 changes: 21 additions & 0 deletions src/set_up_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,34 @@ void set_up_parser(sharg::parser & parser, configuration & config)
"accuracy.",
.default_message = "k-mer size",
});

parser.add_option(config.output_timings,
sharg::config{.short_id = '\0',
.long_id = "timing-output",
.description = "Write time and memory usage to specified file (TSV format). ",
.default_message = "",
.validator = sharg::output_file_validator{}});

// Registers the long-only option `--maximum-index-size`, bound to config.maximum_index_size.
// NOTE(review): the layout code visible alongside this change only reads config.number_of_partitions;
// confirm that maximum_index_size is actually translated into a partition count somewhere.
parser.add_option(
config.maximum_index_size,
sharg::config{
.short_id = '\0',
.long_id = "maximum-index-size",
.description =
"You can restrict the hibf index to have a maximum index size which will partition the index into "
"several partitions if needed. The number of partitions is computed based on your input data. "
"you can manually set the number of partitions with --number-of-partitions"});

// Registers the advanced, long-only option `--number-of-partitions`, bound to
// config.number_of_partitions. The spelling of the long id matters: the help text of
// `--maximum-index-size` points users at exactly this flag name.
parser.add_option(
    config.number_of_partitions,
    sharg::config{
        .short_id = '\0',
        .long_id = "number-of-partitions",
        .description =
            "The number of partitions of the HIBF. We recommend to instead use the option maximum-index-size if "
            "your goal is to reduce the index size and thereby peak memory usage of searching with the HIBF.",
        .advanced = true});

parser.add_option(
config.hibf_config.tmax,
sharg::config{
Expand Down

0 comments on commit b4dc08a

Please sign in to comment.