wip on partitioning the HIBF

seqan · Nov 9, 2023 · b4dc08a · b4dc08a
1 parent 64a6eac
commit b4dc08a
Show file tree

Hide file tree

Showing 3 changed files with 174 additions and 52 deletions.
diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp
@@ -43,6 +43,16 @@ struct configuration
     uint8_t window_size{k};
     //!\}
 
+    /*!\name Partitioned HIBF configuration
+     * \{
+     */
+    //!\brief The maximum index size that the HIBF should not exceed. number_of_partitions will be set accordingly.
+    size_t maximum_index_size{0};
+
+    //!\brief The number of partitions for the HIBF index.
+    size_t number_of_partitions{0};
+    //!\}
+
     /*!\name Configuration of size estimates
      * \{
      */

diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp
@@ -12,6 +12,7 @@
 #include <filesystem>
 #include <fstream>
 #include <iostream>
+#include <numeric>
 #include <stdexcept>
 #include <string>
 #include <tuple>
@@ -27,8 +28,11 @@
 
 #include <hibf/layout/compute_layout.hpp>
 #include <hibf/layout/layout.hpp>
+#include <hibf/misc/divide_and_ceil.hpp>
+#include <hibf/sketch/toolbox.hpp>
 #include <hibf/sketch/compute_sketches.hpp>
 #include <hibf/sketch/hyperloglog.hpp>
+#include <hibf/sketch/toolbox.hpp>
 
 namespace chopper::layout
 {
@@ -58,71 +62,158 @@ int execute(chopper::configuration & config, std::vector<std::string> const & fi
                   << "anyway, so we increased your number of technical bins to " << config.hibf_config.tmax << ".\n";
     }
 
-    seqan::hibf::layout::layout hibf_layout;
-    std::vector<seqan::hibf::sketch::hyperloglog> sketches;
+    if (config.number_of_partitions < 2) // 0 == unset == single HIBF, 1 == single HIBF
+    {
+        seqan::hibf::layout::layout hibf_layout;
+        std::vector<seqan::hibf::sketch::hyperloglog> sketches;
 
-    seqan::hibf::concurrent_timer compute_sketches_timer{};
-    seqan::hibf::concurrent_timer union_estimation_timer{};
-    seqan::hibf::concurrent_timer rearrangement_timer{};
-    seqan::hibf::concurrent_timer dp_algorithm_timer{};
+        seqan::hibf::concurrent_timer compute_sketches_timer{};
+        seqan::hibf::concurrent_timer union_estimation_timer{};
+        seqan::hibf::concurrent_timer rearrangement_timer{};
+        seqan::hibf::concurrent_timer dp_algorithm_timer{};
 
-    if (config.determine_best_tmax)
-    {
-        std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config);
+        if (config.determine_best_tmax)
+        {
+            std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config);
+        }
+        else
+        {
+            std::vector<size_t> kmer_counts;
+
+            compute_sketches_timer.start();
+            seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches);
+            compute_sketches_timer.stop();
+
+            std::vector<size_t> positions = [&kmer_counts]()
+            {
+                std::vector<size_t> ps;
+                ps.resize(kmer_counts.size());
+                std::iota(ps.begin(), ps.end(), 0);
+                return ps;
+            }(); // GCOVR_EXCL_LINE
+
+            dp_algorithm_timer.start();
+            hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config,
+                                                            kmer_counts,
+                                                            sketches,
+                                                            std::move(positions),
+                                                            union_estimation_timer,
+                                                            rearrangement_timer);
+            dp_algorithm_timer.stop();
+
+            if (config.output_verbose_statistics)
+            {
+                size_t dummy{};
+                chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts};
+                global_stats.hibf_layout = hibf_layout;
+                global_stats.print_header_to(std::cout);
+                global_stats.print_summary_to(dummy, std::cout);
+            }
+        }
+
+        if (!config.disable_sketch_output)
+        {
+            if (!std::filesystem::exists(config.sketch_directory))
+                std::filesystem::create_directory(config.sketch_directory);
+
+            assert(filenames.size() == sketches.size());
+            for (size_t i = 0; i < filenames.size(); ++i)
+                sketch::write_sketch_file(filenames[i], sketches[i], config);
+        }
+
+        // brief Write the output to the layout file.
+        std::ofstream fout{config.output_filename};
+        chopper::layout::write_user_bins_to(filenames, fout);
+        config.write_to(fout);
+        hibf_layout.write_to(fout);
+
+        if (!config.output_timings.empty())
+        {
+            std::ofstream output_stream{config.output_timings};
+            output_stream << std::fixed << std::setprecision(2);
+            output_stream << "sketching_in_seconds\t"
+                          << "layouting_in_seconds\t"
+                          << "union_estimation_in_seconds\t"
+                          << "rearrangement_in_seconds\n";
+            output_stream << compute_sketches_timer.in_seconds() << '\t';
+            output_stream << dp_algorithm_timer.in_seconds() << '\t';
+            output_stream << union_estimation_timer.in_seconds() << '\t';
+            output_stream << rearrangement_timer.in_seconds() << '\t';
+        }
     }
     else
     {
-        std::vector<size_t> kmer_counts;
+        std::vector<size_t> cardinalities;
+        std::vector<seqan::hibf::sketch::hyperloglog> sketches;
+        std::vector<std::vector<size_t>> positions; // assign positions of user bins for each partition
+        std::vector<seqan::hibf::layout::layout> hibf_layouts(config.number_of_partitions); // multiple layouts
+        seqan::hibf::concurrent_timer compute_sketches_timer{};
 
+        // compute sketches of all user bins
         compute_sketches_timer.start();
-        seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches);
+        seqan::hibf::sketch::compute_sketches(config.hibf_config, cardinalities, sketches);
         compute_sketches_timer.stop();
-        dp_algorithm_timer.start();
-        hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config,
-                                                          kmer_counts,
-                                                          sketches,
-                                                          union_estimation_timer,
-                                                          rearrangement_timer);
-        dp_algorithm_timer.stop();
-
-        if (config.output_verbose_statistics)
+
+        size_t const sum_of_cardinalities = std::accumulate(cardinalities.begin(), cardinalities.end(), 0u); // can this overflow?
+        size_t const cardinality_per_part = seqan::hibf::divide_and_ceil(sum_of_cardinalities, config.number_of_partitions);
+
+        size_t current_cardinality{0u};
+        size_t current_part{0u};
+
+        std::vector<size_t> const sorted_positions = [&cardinalities]()
+        {
+            std::vector<size_t> ps;
+            ps.resize(cardinalities.size());
+            std::iota(ps.begin(), ps.end(), 0);
+            seqan::hibf::sketch::toolbox::sort_by_cardinalities(cardinalities, ps);
+            return ps;
+        }();
+
+        for (size_t const current_user_bin_id : sorted_positions)
         {
-            size_t dummy{};
-            chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts};
-            global_stats.hibf_layout = hibf_layout;
-            global_stats.print_header_to(std::cout);
-            global_stats.print_summary_to(dummy, std::cout);
+            positions[current_part].push_back(current_user_bin_id);
+            current_cardinality += cardinalities[current_user_bin_id];
+
+            if (current_cardinality >= cardinality_per_part)
+            {
+                current_cardinality = 0;
+                ++current_part;
+            }
         }
-    }
 
-    if (!config.disable_sketch_output)
-    {
-        if (!std::filesystem::exists(config.sketch_directory))
-            std::filesystem::create_directory(config.sketch_directory);
-
-        assert(filenames.size() == sketches.size());
-        for (size_t i = 0; i < filenames.size(); ++i)
-            sketch::write_sketch_file(filenames[i], sketches[i], config);
-    }
+#pragma omp parallel for schedule(dynamic) num_threads(config.hibf_config.threads)
+        for (size_t i = 0; i < config.number_of_partitions; ++i)
+        {
+            seqan::hibf::concurrent_timer union_estimation_timer{};
+            seqan::hibf::concurrent_timer rearrangement_timer{};
+            seqan::hibf::concurrent_timer dp_algorithm_timer{};
+
+            dp_algorithm_timer.start();
+            hibf_layouts[i] = seqan::hibf::layout::compute_layout(config.hibf_config, cardinalities, sketches, std::move(positions[i]), union_estimation_timer, rearrangement_timer);
+            dp_algorithm_timer.stop();
+
+            if (!config.output_timings.empty())
+            {
+                std::ofstream output_stream{config.output_timings};
+                output_stream << std::fixed << std::setprecision(2);
+                output_stream << "sketching_in_seconds\t"
+                            << "layouting_in_seconds\t"
+                            << "union_estimation_in_seconds\t"
+                            << "rearrangement_in_seconds\n";
+                output_stream << compute_sketches_timer.in_seconds() << '\t';
+                output_stream << dp_algorithm_timer.in_seconds() << '\t';
+                output_stream << union_estimation_timer.in_seconds() << '\t';
+                output_stream << rearrangement_timer.in_seconds() << '\t';
+            }
+        }
 
-    // brief Write the output to the layout file.
-    std::ofstream fout{config.output_filename};
-    chopper::layout::write_user_bins_to(filenames, fout);
-    config.write_to(fout);
-    hibf_layout.write_to(fout);
+        // brief Write the output to the layout file.
+        std::ofstream fout{config.output_filename};
+        chopper::layout::write_user_bins_to(filenames, fout);
+        config.write_to(fout);
 
-    if (!config.output_timings.empty())
-    {
-        std::ofstream output_stream{config.output_timings};
-        output_stream << std::fixed << std::setprecision(2);
-        output_stream << "sketching_in_seconds\t"
-                      << "layouting_in_seconds\t"
-                      << "union_estimation_in_seconds\t"
-                      << "rearrangement_in_seconds\n";
-        output_stream << compute_sketches_timer.in_seconds() << '\t';
-        output_stream << dp_algorithm_timer.in_seconds() << '\t';
-        output_stream << union_estimation_timer.in_seconds() << '\t';
-        output_stream << rearrangement_timer.in_seconds() << '\t';
+        for (size_t i = 0; i < config.number_of_partitions; ++i)
+            hibf_layouts[i].write_to(fout);
     }
 
     return 0;

diff --git a/src/set_up_parser.cpp b/src/set_up_parser.cpp
@@ -78,13 +78,34 @@ void set_up_parser(sharg::parser & parser, configuration & config)
                 "accuracy.",
             .default_message = "k-mer size",
         });
+
     parser.add_option(config.output_timings,
                       sharg::config{.short_id = '\0',
                                     .long_id = "timing-output",
                                     .description = "Write time and memory usage to specified file (TSV format). ",
                                     .default_message = "",
                                     .validator = sharg::output_file_validator{}});
 
+    parser.add_option(
+        config.maximum_index_size,
+        sharg::config{
+            .short_id = '\0',
+            .long_id = "maximum-index-size",
+            .description =
+                "You can restrict the hibf index to have a maximum index size which will partition the index into "
+                "several partitions if needed. The number of partitions is computed based on your input data. "
+                "you can manually set the number of partitions with --number-of-partitions"});
+
+    parser.add_option(
+        config.number_of_partitions,
+        sharg::config{
+            .short_id = '\0',
+            .long_id = "number-of-paritions",
+            .description =
+                "The number of partitions of the HIBF. We recommend to instead use the option maximum-index-size if "
+                "your goal is to reduce the index size and thereby peak mempry usage of searching with the HIBF.",
+            .advanced = true});
+
     parser.add_option(
         config.hibf_config.tmax,
         sharg::config{