seqan · eseiler · Oct 25, 2023 · Oct 26, 2023 · Nov 9, 2023 · Dec 9, 2023
diff --git a/include/hibf/layout/compute_layout.hpp b/include/hibf/layout/compute_layout.hpp
@@ -20,13 +20,15 @@ namespace seqan::hibf::layout
  * \param[in] config The configuration to compute the layout with.
  * \param[in] kmer_counts The vector that will store the kmer counts (estimations).
  * \param[in] sketches The vector that will store the sketches.
+ * \param[in] positions The vector that will store the positions.
  * \param[in,out] union_estimation_timer The timer that measures the union estimation time.
  * \param[in,out] rearrangement_timer The timer that measures the rearrangement time.
  * \returns layout
  */
 layout compute_layout(config const & config,
                       std::vector<size_t> const & kmer_counts,
                       std::vector<sketch::hyperloglog> const & sketches,
+                      std::vector<size_t> && positions,
                       concurrent_timer & union_estimation_timer,
                       concurrent_timer & rearrangement_timer);
 

diff --git a/include/hibf/misc/partition.hpp b/include/hibf/misc/partition.hpp
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <bit>
+#include <cstddef>
+#include <cstdint>
+
+#include <hibf/platform.hpp>
+
+namespace seqan::hibf
+{
+
+struct partition_toolbox
+{
+    partition_toolbox(partition_toolbox const &) = default;
+    partition_toolbox(partition_toolbox &&) = default;
+    partition_toolbox & operator=(partition_toolbox const &) = default;
+    partition_toolbox & operator=(partition_toolbox &&) = default;
+    ~partition_toolbox() = default;
+
+    explicit partition_toolbox(size_t const parts) : partitions{parts}
+    {
+        size_t const suffixes = next_power_of_four(partitions);
+        size_t const suffixes_per_part = suffixes / partitions;
+        mask = suffixes - 1;
+        shift_value = std::countr_zero(suffixes_per_part);
+    }
+
+    size_t partitions{};
+    size_t mask{};
+    int shift_value{};
+
+    // The number of suffixes is a power of four.
+    // The number of partitions is a power of two.
+    // The number of suffixes is greater than or equal to the number of partitions.
+    // This means that the number of suffixes per partition is always a power of two.
+    // Therefore, we can do a right shift instead of the division in:
+    // (hash & mask) / suffixes_per_part == partition
+    // The compiler cannot optimize the division to a right shift because suffixes_per_part is a runtime variable.
+    constexpr size_t hash_partition(uint64_t const hash) const
+    {
+        return (hash & mask) >> shift_value;
+    }
+
+    static constexpr size_t next_power_of_four(size_t number)
+    {
+        if (number == 0ULL || number == 1ULL)
+            return 1ULL; // GCOVR_EXCL_LINE
+
+        --number;
+        int const highest_set_bit_pos = std::bit_width(number);
+        int const shift_amount = (highest_set_bit_pos + (highest_set_bit_pos & 1)) - 2;
+        //                       (           Next multiple of two                )   4 has two zeros
+        return 0b0100ULL << shift_amount;
+    }
+};
+
+} // namespace seqan::hibf
diff --git a/include/hibf/sketch/compute_sketches.hpp b/include/hibf/sketch/compute_sketches.hpp
@@ -23,4 +23,9 @@ void compute_sketches(config const & config,
                       std::vector<size_t> & kmer_counts,
                       std::vector<sketch::hyperloglog> & sketches);
 
+void compute_sketches(config const & config,
+                      std::vector<std::vector<size_t>> & kmer_counts,
+                      std::vector<std::vector<sketch::hyperloglog>> & sketches,
+                      size_t const number_of_partitions);
+
 } // namespace seqan::hibf::sketch
diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp
@@ -218,10 +218,19 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
                                     return count == 0u;
                                 }));
 
+    std::vector<size_t> positions = [&kmer_counts]()
+    {
+        std::vector<size_t> ps;
+        ps.resize(kmer_counts.size());
+        std::iota(ps.begin(), ps.end(), 0);
+        return ps;
+    }();
+
     layout_dp_algorithm_timer.start();
     auto layout = layout::compute_layout(configuration,
                                          kmer_counts,
                                          sketches,
+                                         std::move(positions),
                                          layout_union_estimation_timer,
                                          layout_rearrangement_timer);
     layout_dp_algorithm_timer.stop();

diff --git a/src/layout/compute_layout.cpp b/src/layout/compute_layout.cpp
@@ -24,9 +24,15 @@ namespace seqan::hibf::layout
 layout compute_layout(config const & config,
                       std::vector<size_t> const & kmer_counts,
                       std::vector<sketch::hyperloglog> const & sketches,
+                      std::vector<size_t> && positions,
                       concurrent_timer & union_estimation_timer,
                       concurrent_timer & rearrangement_timer)
 {
+    assert(kmer_counts.size() == sketches.size());
+    assert(positions.size() <= sketches.size());
+    assert(sketches.size() == config.number_of_user_bins);
+    assert(*std::ranges::max_element(positions) <= config.number_of_user_bins);
+
     layout resulting_layout{};
 
     // The output streams facilitate writing the layout file in hierarchical structure.
@@ -37,7 +43,8 @@ layout compute_layout(config const & config,
     data_store store{.false_positive_rate = config.maximum_fpr,
                      .hibf_layout = &resulting_layout,
                      .kmer_counts = std::addressof(kmer_counts),
-                     .sketches = std::addressof(sketches)};
+                     .sketches = std::addressof(sketches),
+                     .positions = std::move(positions)};
 
     store.fpr_correction = compute_fpr_correction({.fpr = config.maximum_fpr, //
                                                    .hash_count = config.number_of_hash_functions,
@@ -66,7 +73,20 @@ layout compute_layout(config const & config,
     concurrent_timer union_estimation_timer;
     concurrent_timer rearrangement_timer;
 
-    return compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer);
+    std::vector<size_t> positions = [&kmer_counts]()
+    {
+        std::vector<size_t> ps;
+        ps.resize(kmer_counts.size());
+        std::iota(ps.begin(), ps.end(), 0);
+        return ps;
+    }();
+
+    return compute_layout(config,
+                          kmer_counts,
+                          sketches,
+                          std::move(positions),
+                          union_estimation_timer,
+                          rearrangement_timer);
 }
 
 } // namespace seqan::hibf::layout
diff --git a/src/layout/layout.cpp b/src/layout/layout.cpp
@@ -121,8 +121,10 @@ void seqan::hibf::layout::layout::read_from(std::istream & stream)
 
     assert(line == prefix::layout_column_names);
 
-    // parse the rest of the file
-    while (std::getline(stream, line))
+    // parse the rest of the file until either
+    // 1) the end of the file is reached
+    // 2) Another header line starts, which indicates a partitioned layout
+    while (stream.good() && static_cast<char>(stream.peek()) != prefix::layout_header[0] && std::getline(stream, line))
         user_bins.emplace_back(parse_layout_line(line));
 }
 

diff --git a/src/sketch/compute_sketches.cpp b/src/sketch/compute_sketches.cpp
@@ -7,8 +7,9 @@
 #include <functional> // for function
 #include <vector>     // for vector
 
-#include <hibf/config.hpp>                      // for config, insert_iterator
-#include <hibf/contrib/robin_hood.hpp>          // for unordered_flat_set
+#include <hibf/config.hpp>             // for config, insert_iterator
+#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
+#include <hibf/misc/partition.hpp>
 #include <hibf/sketch/compute_sketches.hpp>     // for compute_sketches
 #include <hibf/sketch/estimate_kmer_counts.hpp> // for estimate_kmer_counts
 #include <hibf/sketch/hyperloglog.hpp>          // for hyperloglog
@@ -43,4 +44,35 @@ void compute_sketches(config const & config,
     sketch::estimate_kmer_counts(sketches, kmer_counts);
 }
 
+void compute_sketches(config const & config,
+                      std::vector<std::vector<size_t>> & cardinalities,
+                      std::vector<std::vector<sketch::hyperloglog>> & sketches,
+                      size_t const number_of_partitions)
+{
+    partition_toolbox partition_helper{number_of_partitions};
+
+    sketches.resize(number_of_partitions);
+    cardinalities.resize(number_of_partitions);
+
+    for (size_t i = 0; i < number_of_partitions; ++i)
+    {
+        sketches[i].resize(config.number_of_user_bins, seqan::hibf::sketch::hyperloglog(config.sketch_bits));
+        cardinalities[i].resize(config.number_of_user_bins);
+    }
+
+    robin_hood::unordered_flat_set<uint64_t> kmers;
+#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
+    for (size_t i = 0; i < config.number_of_user_bins; ++i)
+    {
+        kmers.clear();
+        config.input_fn(i, insert_iterator{kmers});
+
+        for (auto k_hash : kmers)
+            sketches[partition_helper.hash_partition(k_hash)][i].add(k_hash);
+    }
+
+    for (size_t i = 0; i < number_of_partitions; ++i)
+        sketch::estimate_kmer_counts(sketches[i], cardinalities[i]);
+}
+
 } // namespace seqan::hibf::sketch
diff --git a/test/unit/hibf/layout/compute_layout_test.cpp b/test/unit/hibf/layout/compute_layout_test.cpp
@@ -6,6 +6,7 @@
 
 #include <cstddef>    // for size_t
 #include <functional> // for function
+#include <numeric>    // for iota
 #include <vector>     // for vector, allocator
 
 #include <hibf/config.hpp>                  // for insert_iterator, config
@@ -39,8 +40,20 @@ TEST(compute_layout, dispatch)
     seqan::hibf::concurrent_timer union_estimation_timer{};
     seqan::hibf::concurrent_timer rearrangement_timer{};
 
-    auto layout2 =
-        seqan::hibf::layout::compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer);
+    std::vector<size_t> positions = [&kmer_counts]()
+    {
+        std::vector<size_t> ps;
+        ps.resize(kmer_counts.size());
+        std::iota(ps.begin(), ps.end(), 0);
+        return ps;
+    }();
+
+    auto layout2 = seqan::hibf::layout::compute_layout(config,
+                                                       kmer_counts,
+                                                       sketches,
+                                                       std::move(positions),
+                                                       union_estimation_timer,
+                                                       rearrangement_timer);
 
     EXPECT_TRUE(layout1 == layout2);
 }
diff --git a/test/unit/hibf/layout/layout_test.cpp b/test/unit/hibf/layout/layout_test.cpp
@@ -116,3 +116,48 @@ TEST(layout_test, clear)
     EXPECT_TRUE(layout.max_bins.empty());
     EXPECT_TRUE(layout.user_bins.empty());
 }
+
+TEST(layout_test, read_from_partitioned_layout)
+{
+    // layout consists of three partitions, written one after the other
+    std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
+#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
+#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
+#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
+#USER_BIN_IDX	TECHNICAL_BIN_INDICES	NUMBER_OF_TECHNICAL_BINS
+7	0	1
+4	1;0	1;22
+5	1;2;3;4;22	1;1;1;1;21
+#TOP_LEVEL_IBF fullest_technical_bin_idx:111
+#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
+#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
+#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
+#USER_BIN_IDX	TECHNICAL_BIN_INDICES	NUMBER_OF_TECHNICAL_BINS
+7	0	1
+4	1;0	1;22
+5	1;2;3;4;22	1;1;1;1;21
+#TOP_LEVEL_IBF fullest_technical_bin_idx:111
+#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
+#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
+#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
+#USER_BIN_IDX	TECHNICAL_BIN_INDICES	NUMBER_OF_TECHNICAL_BINS
+7	0	1
+4	1;0	1;22
+5	1;2;3;4;22	1;1;1;1;21
+)layout_file"};
+
+    for (size_t i = 0; i < 3; ++i)
+    {
+        seqan::hibf::layout::layout layout;
+        layout.read_from(ss);
+
+        EXPECT_EQ(layout.top_level_max_bin_id, 111);
+        EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0}));
+        EXPECT_EQ(layout.max_bins[1], (seqan::hibf::layout::layout::max_bin{{2}, 2}));
+        EXPECT_EQ(layout.max_bins[2], (seqan::hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22}));
+        EXPECT_EQ(layout.user_bins[0], (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{}, 0, 1, 7}));
+        EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{1}, 0, 22, 4}));
+        EXPECT_EQ(layout.user_bins[2],
+                  (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{1, 2, 3, 4}, 22, 21, 5}));
+    }
+}