Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/hibf/layout/compute_layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ namespace seqan::hibf::layout
* \param[in] config The configuration to compute the layout with.
* \param[in] kmer_counts The vector that will store the kmer counts (estimations).
* \param[in] sketches The vector that will store the sketches.
* \param[in] positions The vector that will store the positions.
* \param[in,out] union_estimation_timer The timer that measures the union estimation time.
* \param[in,out] rearrangement_timer The timer that measures the rearrangement time.
* \returns layout
*/
layout compute_layout(config const & config,
std::vector<size_t> const & kmer_counts,
std::vector<sketch::hyperloglog> const & sketches,
std::vector<size_t> && positions,
concurrent_timer & union_estimation_timer,
concurrent_timer & rearrangement_timer);

Expand Down
61 changes: 61 additions & 0 deletions include/hibf/misc/partition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <bit>
#include <cstddef>
#include <cstdint>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

struct partition_toolbox
{
partition_toolbox(partition_toolbox const &) = default;
partition_toolbox(partition_toolbox &&) = default;
partition_toolbox & operator=(partition_toolbox const &) = default;
partition_toolbox & operator=(partition_toolbox &&) = default;
~partition_toolbox() = default;

explicit partition_toolbox(size_t const parts) : partitions{parts}
{
size_t const suffixes = next_power_of_four(partitions);
size_t const suffixes_per_part = suffixes / partitions;
mask = suffixes - 1;
shift_value = std::countr_zero(suffixes_per_part);
}

size_t partitions{};
size_t mask{};
int shift_value{};

// The number of suffixes is a power of four.
// The number of partitions is a power of two.
// The number of suffixes is greater than or equal to the number of partitions.
// This means that the number of suffixes per partition is always a power of two.
// Therefore, we can do a right shift instead of the division in:
// (hash & mask) / suffixes_per_part == partition
// The compiler cannot optimize the division to a right shift because suffixes_per_part is a runtime variable.
constexpr size_t hash_partition(uint64_t const hash) const
{
return (hash & mask) >> shift_value;
}

static constexpr size_t next_power_of_four(size_t number)
{
if (number == 0ULL || number == 1ULL)
return 1ULL; // GCOVR_EXCL_LINE

--number;
int const highest_set_bit_pos = std::bit_width(number);
int const shift_amount = (highest_set_bit_pos + (highest_set_bit_pos & 1)) - 2;
// ( Next multiple of two ) 4 has two zeros
return 0b0100ULL << shift_amount;
}
};

} // namespace seqan::hibf
5 changes: 5 additions & 0 deletions include/hibf/sketch/compute_sketches.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,9 @@ void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & sketches);

void compute_sketches(config const & config,
std::vector<std::vector<size_t>> & kmer_counts,
std::vector<std::vector<sketch::hyperloglog>> & sketches,
size_t const number_of_partitions);

} // namespace seqan::hibf::sketch
9 changes: 9 additions & 0 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,19 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
return count == 0u;
}));

std::vector<size_t> positions = [&kmer_counts]()
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}();

layout_dp_algorithm_timer.start();
auto layout = layout::compute_layout(configuration,
kmer_counts,
sketches,
std::move(positions),
layout_union_estimation_timer,
layout_rearrangement_timer);
layout_dp_algorithm_timer.stop();
Expand Down
24 changes: 22 additions & 2 deletions src/layout/compute_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,15 @@ namespace seqan::hibf::layout
layout compute_layout(config const & config,
std::vector<size_t> const & kmer_counts,
std::vector<sketch::hyperloglog> const & sketches,
std::vector<size_t> && positions,
concurrent_timer & union_estimation_timer,
concurrent_timer & rearrangement_timer)
{
assert(kmer_counts.size() == sketches.size());
assert(positions.size() <= sketches.size());
assert(sketches.size() == config.number_of_user_bins);
assert(*std::ranges::max_element(positions) <= config.number_of_user_bins);

layout resulting_layout{};

// The output streams facilitate writing the layout file in hierarchical structure.
Expand All @@ -37,7 +43,8 @@ layout compute_layout(config const & config,
data_store store{.false_positive_rate = config.maximum_fpr,
.hibf_layout = &resulting_layout,
.kmer_counts = std::addressof(kmer_counts),
.sketches = std::addressof(sketches)};
.sketches = std::addressof(sketches),
.positions = std::move(positions)};

store.fpr_correction = compute_fpr_correction({.fpr = config.maximum_fpr, //
.hash_count = config.number_of_hash_functions,
Expand Down Expand Up @@ -66,7 +73,20 @@ layout compute_layout(config const & config,
concurrent_timer union_estimation_timer;
concurrent_timer rearrangement_timer;

return compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer);
std::vector<size_t> positions = [&kmer_counts]()
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}();

return compute_layout(config,
kmer_counts,
sketches,
std::move(positions),
union_estimation_timer,
rearrangement_timer);
}

} // namespace seqan::hibf::layout
6 changes: 4 additions & 2 deletions src/layout/layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,10 @@ void seqan::hibf::layout::layout::read_from(std::istream & stream)

assert(line == prefix::layout_column_names);

// parse the rest of the file
while (std::getline(stream, line))
// parse the rest of the file until either
// 1) the end of the file is reached
// 2) Another header line starts, which indicates a partitioned layout
while (stream.good() && static_cast<char>(stream.peek()) != prefix::layout_header[0] && std::getline(stream, line))
user_bins.emplace_back(parse_layout_line(line));
}

Expand Down
36 changes: 34 additions & 2 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#include <functional> // for function
#include <vector> // for vector

#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/misc/partition.hpp>
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/estimate_kmer_counts.hpp> // for estimate_kmer_counts
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
Expand Down Expand Up @@ -43,4 +44,35 @@ void compute_sketches(config const & config,
sketch::estimate_kmer_counts(sketches, kmer_counts);
}

void compute_sketches(config const & config,
std::vector<std::vector<size_t>> & cardinalities,
std::vector<std::vector<sketch::hyperloglog>> & sketches,
size_t const number_of_partitions)
{
partition_toolbox partition_helper{number_of_partitions};

sketches.resize(number_of_partitions);
cardinalities.resize(number_of_partitions);

for (size_t i = 0; i < number_of_partitions; ++i)
{
sketches[i].resize(config.number_of_user_bins, seqan::hibf::sketch::hyperloglog(config.sketch_bits));
cardinalities[i].resize(config.number_of_user_bins);
}

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
for (size_t i = 0; i < config.number_of_user_bins; ++i)
{
kmers.clear();
config.input_fn(i, insert_iterator{kmers});

for (auto k_hash : kmers)
sketches[partition_helper.hash_partition(k_hash)][i].add(k_hash);
}

for (size_t i = 0; i < number_of_partitions; ++i)
sketch::estimate_kmer_counts(sketches[i], cardinalities[i]);
}

} // namespace seqan::hibf::sketch
17 changes: 15 additions & 2 deletions test/unit/hibf/layout/compute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <cstddef> // for size_t
#include <functional> // for function
#include <numeric> // for iota
#include <vector> // for vector, allocator

#include <hibf/config.hpp> // for insert_iterator, config
Expand Down Expand Up @@ -39,8 +40,20 @@ TEST(compute_layout, dispatch)
seqan::hibf::concurrent_timer union_estimation_timer{};
seqan::hibf::concurrent_timer rearrangement_timer{};

auto layout2 =
seqan::hibf::layout::compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer);
std::vector<size_t> positions = [&kmer_counts]()
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}();

auto layout2 = seqan::hibf::layout::compute_layout(config,
kmer_counts,
sketches,
std::move(positions),
union_estimation_timer,
rearrangement_timer);

EXPECT_TRUE(layout1 == layout2);
}
45 changes: 45 additions & 0 deletions test/unit/hibf/layout/layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,48 @@ TEST(layout_test, clear)
EXPECT_TRUE(layout.max_bins.empty());
EXPECT_TRUE(layout.user_bins.empty());
}

TEST(layout_test, read_from_partitioned_layout)
{
// layout consists of three partitions, written one after the other
std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
)layout_file"};

for (size_t i = 0; i < 3; ++i)
{
seqan::hibf::layout::layout layout;
layout.read_from(ss);

EXPECT_EQ(layout.top_level_max_bin_id, 111);
EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0}));
EXPECT_EQ(layout.max_bins[1], (seqan::hibf::layout::layout::max_bin{{2}, 2}));
EXPECT_EQ(layout.max_bins[2], (seqan::hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22}));
EXPECT_EQ(layout.user_bins[0], (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{}, 0, 1, 7}));
EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{1}, 0, 22, 4}));
EXPECT_EQ(layout.user_bins[2],
(seqan::hibf::layout::layout::user_bin{std::vector<size_t>{1, 2, 3, 4}, 22, 21, 5}));
}
}