diff --git a/include/hibf/layout/compute_layout.hpp b/include/hibf/layout/compute_layout.hpp index eede2db1..2de65a43 100644 --- a/include/hibf/layout/compute_layout.hpp +++ b/include/hibf/layout/compute_layout.hpp @@ -20,6 +20,7 @@ namespace seqan::hibf::layout * \param[in] config The configuration to compute the layout with. * \param[in] kmer_counts The vector that will store the kmer counts (estimations). * \param[in] sketches The vector that will store the sketches. + * \param[in] positions The vector that will store the positions. * \param[in,out] union_estimation_timer The timer that measures the union estimation time. * \param[in,out] rearrangement_timer The timer that measures the rearrangement time. * \returns layout @@ -27,6 +28,7 @@ namespace seqan::hibf::layout layout compute_layout(config const & config, std::vector const & kmer_counts, std::vector const & sketches, + std::vector && positions, concurrent_timer & union_estimation_timer, concurrent_timer & rearrangement_timer); diff --git a/include/hibf/misc/partition.hpp b/include/hibf/misc/partition.hpp new file mode 100644 index 00000000..c1960b44 --- /dev/null +++ b/include/hibf/misc/partition.hpp @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +#include + +namespace seqan::hibf +{ + +struct partition_toolbox +{ + partition_toolbox(partition_toolbox const &) = default; + partition_toolbox(partition_toolbox &&) = default; + partition_toolbox & operator=(partition_toolbox const &) = default; + partition_toolbox & operator=(partition_toolbox &&) = default; + ~partition_toolbox() = default; + + explicit partition_toolbox(size_t const parts) : partitions{parts} + { + size_t const suffixes = next_power_of_four(partitions); + size_t const suffixes_per_part = suffixes / partitions; + mask = suffixes - 1; + shift_value = std::countr_zero(suffixes_per_part); + } + + size_t partitions{}; + size_t mask{}; + int shift_value{}; + + // The number of suffixes is a power of four. + // The number of partitions is a power of two. + // The number of suffixes is greater than or equal to the number of partitions. + // This means that the number of suffixes per partition is always a power of two. + // Therefore, we can do a right shift instead of the division in: + // (hash & mask) / suffixes_per_part == partition + // The compiler cannot optimize the division to a right shift because suffixes_per_part is a runtime variable. + constexpr size_t hash_partition(uint64_t const hash) const + { + return (hash & mask) >> shift_value; + } + + static constexpr size_t next_power_of_four(size_t number) + { + if (number == 0ULL || number == 1ULL) + return 1ULL; // GCOVR_EXCL_LINE + + --number; + int const highest_set_bit_pos = std::bit_width(number); + int const shift_amount = (highest_set_bit_pos + (highest_set_bit_pos & 1)) - 2; + // ( Next multiple of two ) 4 has two zeros + return 0b0100ULL << shift_amount; + } +}; + +} // namespace seqan::hibf diff --git a/include/hibf/sketch/compute_sketches.hpp b/include/hibf/sketch/compute_sketches.hpp index 977609e6..00a09965 100644 --- a/include/hibf/sketch/compute_sketches.hpp +++ b/include/hibf/sketch/compute_sketches.hpp @@ -23,4 +23,9 @@ void compute_sketches(config const & config, std::vector & kmer_counts, std::vector & sketches); +void compute_sketches(config const & config, + std::vector> & kmer_counts, + std::vector> & sketches, + size_t const number_of_partitions); + } // namespace seqan::hibf::sketch diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 0044c5aa..5571fd20 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -218,10 +218,19 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con return count == 0u; })); + std::vector positions = [&kmer_counts]() + { + std::vector ps; + ps.resize(kmer_counts.size()); + std::iota(ps.begin(), ps.end(), 0); + return ps; + }(); + layout_dp_algorithm_timer.start(); auto layout = layout::compute_layout(configuration, kmer_counts, sketches, + std::move(positions), layout_union_estimation_timer, layout_rearrangement_timer); layout_dp_algorithm_timer.stop(); diff --git a/src/layout/compute_layout.cpp b/src/layout/compute_layout.cpp index a3ddef59..08166d98 100644 --- a/src/layout/compute_layout.cpp +++ b/src/layout/compute_layout.cpp @@ -24,9 +24,15 @@ namespace seqan::hibf::layout layout compute_layout(config const & config, std::vector const & kmer_counts, std::vector const & sketches, + std::vector && positions, concurrent_timer & union_estimation_timer, concurrent_timer & rearrangement_timer) { + assert(kmer_counts.size() == sketches.size()); + assert(positions.size() <= sketches.size()); + assert(sketches.size() == config.number_of_user_bins); + assert(*std::ranges::max_element(positions) <= config.number_of_user_bins); + layout resulting_layout{}; // The output streams facilitate writing the layout file in hierarchical structure. @@ -37,7 +43,8 @@ layout compute_layout(config const & config, data_store store{.false_positive_rate = config.maximum_fpr, .hibf_layout = &resulting_layout, .kmer_counts = std::addressof(kmer_counts), - .sketches = std::addressof(sketches)}; + .sketches = std::addressof(sketches), + .positions = std::move(positions)}; store.fpr_correction = compute_fpr_correction({.fpr = config.maximum_fpr, // .hash_count = config.number_of_hash_functions, @@ -66,7 +73,20 @@ layout compute_layout(config const & config, concurrent_timer union_estimation_timer; concurrent_timer rearrangement_timer; - return compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer); + std::vector positions = [&kmer_counts]() + { + std::vector ps; + ps.resize(kmer_counts.size()); + std::iota(ps.begin(), ps.end(), 0); + return ps; + }(); + + return compute_layout(config, + kmer_counts, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); } } // namespace seqan::hibf::layout diff --git a/src/layout/layout.cpp b/src/layout/layout.cpp index 8db5ecd9..443dfb6f 100644 --- a/src/layout/layout.cpp +++ b/src/layout/layout.cpp @@ -121,8 +121,10 @@ void seqan::hibf::layout::layout::read_from(std::istream & stream) assert(line == prefix::layout_column_names); - // parse the rest of the file - while (std::getline(stream, line)) + // parse the rest of the file until either + // 1) the end of the file is reached + // 2) Another header line starts, which indicates a partitioned layout + while (stream.good() && static_cast(stream.peek()) != prefix::layout_header[0] && std::getline(stream, line)) user_bins.emplace_back(parse_layout_line(line)); } diff --git a/src/sketch/compute_sketches.cpp b/src/sketch/compute_sketches.cpp index cea2e0a9..b39dae67 100644 --- a/src/sketch/compute_sketches.cpp +++ b/src/sketch/compute_sketches.cpp @@ -7,8 +7,9 @@ #include // for function #include // for vector -#include // for config, insert_iterator -#include // for unordered_flat_set +#include // for config, insert_iterator +#include // for unordered_flat_set +#include #include // for compute_sketches #include // for estimate_kmer_counts #include // for hyperloglog @@ -43,4 +44,35 @@ void compute_sketches(config const & config, sketch::estimate_kmer_counts(sketches, kmer_counts); } +void compute_sketches(config const & config, + std::vector> & cardinalities, + std::vector> & sketches, + size_t const number_of_partitions) +{ + partition_toolbox partition_helper{number_of_partitions}; + + sketches.resize(number_of_partitions); + cardinalities.resize(number_of_partitions); + + for (size_t i = 0; i < number_of_partitions; ++i) + { + sketches[i].resize(config.number_of_user_bins, seqan::hibf::sketch::hyperloglog(config.sketch_bits)); + cardinalities[i].resize(config.number_of_user_bins); + } + + robin_hood::unordered_flat_set kmers; +#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers) + for (size_t i = 0; i < config.number_of_user_bins; ++i) + { + kmers.clear(); + config.input_fn(i, insert_iterator{kmers}); + + for (auto k_hash : kmers) + sketches[partition_helper.hash_partition(k_hash)][i].add(k_hash); + } + + for (size_t i = 0; i < number_of_partitions; ++i) + sketch::estimate_kmer_counts(sketches[i], cardinalities[i]); +} + } // namespace seqan::hibf::sketch diff --git a/test/unit/hibf/layout/compute_layout_test.cpp b/test/unit/hibf/layout/compute_layout_test.cpp index 15b8d306..6a17cce8 100644 --- a/test/unit/hibf/layout/compute_layout_test.cpp +++ b/test/unit/hibf/layout/compute_layout_test.cpp @@ -6,6 +6,7 @@ #include // for size_t #include // for function +#include // for iota #include // for vector, allocator #include // for insert_iterator, config @@ -39,8 +40,20 @@ TEST(compute_layout, dispatch) seqan::hibf::concurrent_timer union_estimation_timer{}; seqan::hibf::concurrent_timer rearrangement_timer{}; - auto layout2 = - seqan::hibf::layout::compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer); + std::vector positions = [&kmer_counts]() + { + std::vector ps; + ps.resize(kmer_counts.size()); + std::iota(ps.begin(), ps.end(), 0); + return ps; + }(); + + auto layout2 = seqan::hibf::layout::compute_layout(config, + kmer_counts, + sketches, + std::move(positions), + union_estimation_timer, + rearrangement_timer); EXPECT_TRUE(layout1 == layout2); } diff --git a/test/unit/hibf/layout/layout_test.cpp b/test/unit/hibf/layout/layout_test.cpp index ad388bf1..031efca3 100644 --- a/test/unit/hibf/layout/layout_test.cpp +++ b/test/unit/hibf/layout/layout_test.cpp @@ -116,3 +116,48 @@ TEST(layout_test, clear) EXPECT_TRUE(layout.max_bins.empty()); EXPECT_TRUE(layout.user_bins.empty()); } + +TEST(layout_test, read_from_partitioned_layout) +{ + // layout consists of three partitions, written one after the other + std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +)layout_file"}; + + for (size_t i = 0; i < 3; ++i) + { + seqan::hibf::layout::layout layout; + layout.read_from(ss); + + EXPECT_EQ(layout.top_level_max_bin_id, 111); + EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0})); + EXPECT_EQ(layout.max_bins[1], (seqan::hibf::layout::layout::max_bin{{2}, 2})); + EXPECT_EQ(layout.max_bins[2], (seqan::hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22})); + EXPECT_EQ(layout.user_bins[0], (seqan::hibf::layout::layout::user_bin{std::vector{}, 0, 1, 7})); + EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector{1}, 0, 22, 4})); + EXPECT_EQ(layout.user_bins[2], + (seqan::hibf::layout::layout::user_bin{std::vector{1, 2, 3, 4}, 22, 21, 5})); + } +}