From c339f09ee375b9f04e94684d376783c6a8c1b9af Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 21 Sep 2025 12:19:32 +0200 Subject: [PATCH 1/4] added endpoint sequence --- include/endpoints_sequence.hpp | 296 +++++++++++++++++++++++++++++++ include/util.hpp | 1 + test/test_endpoints_sequence.cpp | 224 +++++++++++++++++++++++ 3 files changed, 521 insertions(+) create mode 100644 include/endpoints_sequence.hpp create mode 100644 test/test_endpoints_sequence.cpp diff --git a/include/endpoints_sequence.hpp b/include/endpoints_sequence.hpp new file mode 100644 index 0000000..bd67a44 --- /dev/null +++ b/include/endpoints_sequence.hpp @@ -0,0 +1,296 @@ +#pragma once + +#include + +#include "bit_vector.hpp" +#include "darray.hpp" +#include "compact_vector.hpp" + +namespace bits { + +/* + This class encodes with Elias-Fano a sequence with the + following properties: + - all elements are distinct; + - the first element is 0; + - when doing `next_geq(x)`, x is always in [0,U) where + U is the last element of the sequence. + + This is the case when the sequence represents a list of + bin sizes in a cumulative way, assuming that each bin is + non empty and we ask what is the bin that comprises element x. + For example: imagine we have a set of strings that we concatenate + together and we keep the list of string boundaries. + If the have 4 strings of length 3, 10, 7, 5 respectively, + then we encode the sequence S = [0, 3, 13, 20, U=25] + + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 + x x x x x + + where U = 3+10+7+5, and length of i-th string is S[i+1]-S[i], + for i = 0..3. + Given the offset x of a character, we ask what is the string that + comprises that offset. In this case, we have that 0 <= x < U. + + The goal of this class is to support fast next_geq queries at the price + of some space increase compared to an `elias_fano` sequence, so: + - The low bits are always 8. + - We keep an array of H of size ceil(U/2^8)-1, where H[i] indicates the + position of the i-th 0 in the high bit array. Each H[i] is written + in 1+log(n) bits. +*/ + +template +struct endpoints_sequence { + endpoints_sequence() : m_back(0) {} + + template + void encode(Iterator begin, uint64_t n, uint64_t universe) { + if (n == 0) return; + + const uint64_t num_high_bits = n + (universe >> 8) + 1; + bit_vector::builder bvb_high_bits(num_high_bits); + m_low_bits.reserve(n); + + compact_vector::builder cv_builder((universe + 256 - 1) / 256 - 1, // ceil(U/2^8)-1 + util::ceil_log2_uint64(num_high_bits)); + + assert(*begin == 0); + uint64_t prev_pos = 0; + for (uint64_t i = 0; i != n; ++i, ++begin) { + auto v = *begin; + m_low_bits.push_back(v & 255); + uint64_t high_part = v >> 8; + uint64_t pos = high_part + i; + // std::cout << "prev_pos = " << prev_pos << "; pos = " << pos << std::endl; + bvb_high_bits.set(pos, 1); + for (uint64_t j = prev_pos + 1; j < pos; ++j) { + // std::cout << "j = " << j << std::endl; + cv_builder.push_back(j); + } + prev_pos = pos; + m_back = v; + } + + bvb_high_bits.build(m_high_bits); + // for (uint64_t i = 0; i != m_high_bits.num_bits(); ++i) { + // std::cout << m_high_bits.get(i) << ' '; + // } + // std::cout << std::endl; + m_high_bits_d1.build(m_high_bits); + cv_builder.build(m_high_bits_d0); + // auto it = m_high_bits_d0.begin(); + // for (uint64_t i = 0; i != m_high_bits_d0.size(); ++i) { + // std::cout << i << " : " << *it << '\n'; + // ++it; + // } + } + + struct iterator { + iterator() : m_ptr(nullptr), m_pos(0), m_val(0) {} + + iterator(endpoints_sequence const* ptr, uint64_t pos = 0) + : m_ptr(ptr) + , m_pos(pos) + , m_val(0) // + { + if (!has_next() or m_ptr->m_high_bits_d1.num_positions() == 0) return; + uint64_t begin = m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos); + m_high_bits_it = m_ptr->m_high_bits.get_iterator_at(begin); + m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; + read_next_value(); + } + + iterator(endpoints_sequence const* ptr, uint64_t pos, uint64_t high_hint) + : m_ptr(ptr) + , m_pos(pos) + , m_val(0) // + { + if (!has_next() or m_ptr->m_high_bits_d1.num_positions() == 0) return; + assert(high_hint == 0 || m_ptr->m_high_bits.get(high_hint) == 0); + m_high_bits_it = m_ptr->m_high_bits.get_iterator_at(high_hint); + m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; + read_next_value(); + } + + bool has_next() const { return m_pos < m_ptr->size(); } + bool has_prev() const { return m_pos > 0; } + uint64_t value() const { return m_val; } + uint64_t position() const { return m_pos; } + + void next() { + ++m_pos; + if (!has_next()) return; + read_next_value(); + } + + /* + Return the value before the current position. + */ + uint64_t prev_value() { + assert(m_pos > 0); + uint64_t pos = m_pos - 1; + /* + `read_next_value()` sets the state ahead of 1 position, + hence must go back by 2 to get previous value. + */ + assert(m_high_bits_it.position() >= 2); + uint64_t high = m_high_bits_it.prev(m_high_bits_it.position() - 2); + assert(high == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, pos)); + uint64_t low = *(m_low_bits_it - 2); + return (((high - pos) << 8) | low); + } + + private: + endpoints_sequence const* m_ptr; + uint64_t m_pos; + uint64_t m_val; + bit_vector::iterator m_high_bits_it; + std::vector::const_iterator m_low_bits_it; + + void read_next_value() { + assert(m_pos < m_ptr->size()); + uint64_t high = m_high_bits_it.next(); + assert(high == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos)); + uint64_t low = *m_low_bits_it; + m_val = (((high - m_pos) << 8) | low); + ++m_low_bits_it; + } + }; + + iterator get_iterator_at(uint64_t pos) const { return iterator(this, pos); } + iterator begin() const { return get_iterator_at(0); } + + uint64_t access(uint64_t i) const { + assert(i < size()); + return ((m_high_bits_d1.select(m_high_bits, i) - i) << 8) | m_low_bits[i]; + } + + struct return_value { + uint64_t pos; + uint64_t val; + }; + + /* + Return [position,value] of the smallest element that is >= x. + */ + return_value next_geq(const uint64_t x) const { return _next_geq(x).first; } + + /* + Determine integers lo and hi, with lo < hi, such that lo <= x < hi, where: + - lo is the largest value that is <= x, + - hi is the smallest value that is > x. + Return the tuple [lo_pos, lo, hi_pos, hi]. + */ + std::pair locate(const uint64_t x) const { + auto [lo, it] = _next_geq(x); + return_value hi{lo.pos, lo.val}; + + if (lo.val > x) { + assert(lo.pos > 0); + lo.pos -= 1; + lo.val = it.prev_value(); + } else { + hi.pos += 1; + hi.val = (it.next(), it.value()); + assert(it.position() == hi.pos); + assert(hi.pos < size()); + } + + return {lo, hi}; + } + + uint64_t back() const { return m_back; } + uint64_t size() const { return m_low_bits.size(); } + + uint64_t num_bytes() const { + // std::cout << "m_high_bits = " << (8.0 * m_high_bits.num_bytes()) / size() << std::endl; + // std::cout << "m_high_bits_d1 = " << (8.0 * m_high_bits_d1.num_bytes()) / size() + // << std::endl; + // std::cout << "m_high_bits_d0 = " << (8.0 * m_high_bits_d0.num_bytes()) / size() + // << std::endl; + // std::cout << "m_low_bits = " << (8.0 * essentials::vec_bytes(m_low_bits)) / size() + // << std::endl; + + return sizeof(m_back) + m_high_bits.num_bytes() + m_high_bits_d1.num_bytes() + + m_high_bits_d0.num_bytes() + essentials::vec_bytes(m_low_bits); + } + + void swap(endpoints_sequence& other) { + std::swap(m_back, other.m_back); + m_high_bits.swap(other.m_high_bits); + m_high_bits_d1.swap(other.m_high_bits_d1); + m_high_bits_d0.swap(other.m_high_bits_d0); + m_low_bits.swap(other.m_low_bits); + } + + template + void visit(Visitor& visitor) const { + visit_impl(visitor, *this); + } + + template + void visit(Visitor& visitor) { + visit_impl(visitor, *this); + } + +private: + uint64_t m_back; + bit_vector m_high_bits; + DArray1 m_high_bits_d1; + compact_vector m_high_bits_d0; + std::vector m_low_bits; + + template + static void visit_impl(Visitor& visitor, T&& t) { + visitor.visit(t.m_back); + visitor.visit(t.m_high_bits); + visitor.visit(t.m_high_bits_d1); + visitor.visit(t.m_high_bits_d0); + visitor.visit(t.m_low_bits); + } + + std::pair _next_geq(const uint64_t x) const // + { + assert(x >= 0); + assert(x < back()); + + uint64_t h_x = x >> 8; + uint64_t p = 0; + uint64_t begin = 0; + if (h_x > 0) { + p = m_high_bits_d0.access(h_x - 1); + begin = p - h_x + 1; + } + assert(begin < size()); + + /* + `begin` is the position of the elements that have high part >= h_x + and `p` is the position in `m_high_bits` of the (h_x-1)-th 0, + so it is passed to the iterator as a hint to recover the high part + of the element at position `begin` without doing a select_1. + */ + + auto it = iterator(this, begin, p /* high hint */); + uint64_t pos = begin; + uint64_t val = it.value(); + while (val < x) { + ++pos; + /* + Note: no need for bound checking here + because x <= back(), hence pos cannot + be equal to size(). + */ + it.next(); + val = it.value(); + } + /* now pos is the position of the element that is >= x */ + assert(val >= x); + assert(pos < size()); + assert(val == access(pos)); + assert(it.position() == pos); + return {{pos, val}, it}; + } +}; + +} // namespace bits \ No newline at end of file diff --git a/include/util.hpp b/include/util.hpp index fd16ffd..32ff70a 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -32,6 +32,7 @@ static inline bool msbll(uint64_t x, uint64_t& ret) { } static inline uint64_t ceil_log2_uint32(uint32_t x) { return (x > 1) ? msb(x - 1) + 1 : 0; } +static inline uint64_t ceil_log2_uint64(uint64_t x) { return (x > 1) ? msbll(x - 1) + 1 : 0; } /* return the position of the least significant bit (lsb) */ static inline uint64_t lsb(uint32_t x) { diff --git a/test/test_endpoints_sequence.cpp b/test/test_endpoints_sequence.cpp new file mode 100644 index 0000000..f59e1d0 --- /dev/null +++ b/test/test_endpoints_sequence.cpp @@ -0,0 +1,224 @@ +#include "common.hpp" +#include "include/endpoints_sequence.hpp" + +constexpr uint64_t sequence_length = 10000; + +std::vector get_sequence(const uint64_t sequence_length) { + std::vector s = test::get_sequence(sequence_length); + std::vector seq; + seq.reserve(s.size() + 1); + uint64_t sum = 0; + for (uint64_t i = 0; i != s.size(); ++i) { + seq.push_back(sum); + sum += s[i] + (s[i] == 0); // +1 so that all values are distinct + } + seq.push_back(sum); + assert(std::adjacent_find(seq.begin(), seq.end()) == seq.end()); // must be all distinct + return seq; +} + +endpoints_sequence<> encode(std::vector const& seq) { + endpoints_sequence<> es; + assert(seq.front() == 0); + es.encode(seq.begin(), seq.size(), seq.back()); + REQUIRE(es.size() == seq.size()); + std::cout << "es.size() = " << es.size() << '\n'; + std::cout << "es.back() = " << es.back() << '\n'; + std::cout << "measured bits/int = " << (8.0 * es.num_bytes()) / es.size() << std::endl; + return es; +} + +TEST_CASE("access") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + std::cout << "checking correctness of random access..." << std::endl; + for (uint64_t i = 0; i != es.size(); ++i) { + uint64_t got = es.access(i); // get the integer at position i + uint64_t expected = seq[i]; + REQUIRE_MESSAGE(got == expected, "got " << got << " at position " << i << "/" << es.size() + << " but expected " << expected); + } + std::cout << "EVERYTHING OK!" << std::endl; +} + +TEST_CASE("iterator::next") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + + std::cout << "checking correctness of iterator..." << std::endl; + + auto it = es.begin(); + for (uint64_t i = 0; i != es.size(); ++i, it.next()) { + uint64_t got = it.value(); + uint64_t expected = seq[i]; + REQUIRE_MESSAGE(got == expected, "got " << got << " at position " << i << "/" << es.size() + << " but expected " << expected); + REQUIRE(it.position() == i); + REQUIRE(it.has_next() == true); + } + REQUIRE(it.position() == es.size()); + REQUIRE(it.has_next() == false); + + uint64_t i = 0; + it = es.begin(); + while (it.has_next()) { + uint64_t got = it.value(); + uint64_t expected = seq[i]; + REQUIRE_MESSAGE(got == expected, "got " << got << " at position " << i << "/" << es.size() + << " but expected " << expected); + REQUIRE(it.position() == i); + REQUIRE(it.has_next() == true); + it.next(); + ++i; + } + REQUIRE(i == es.size()); + REQUIRE(it.position() == es.size()); + REQUIRE(it.has_next() == false); + + std::cout << "EVERYTHING OK!" << std::endl; +} + +TEST_CASE("next_geq") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + + std::cout << "checking correctness of next_geq..." << std::endl; + + for (uint64_t i = 0; i != es.size() - 1; ++i) { + auto x = seq[i]; + /* since x is in the sequence, next_geq must return (i,x) */ + + auto p = es.next_geq(x); + uint64_t pos = p.pos; + uint64_t got = p.val; + uint64_t expected = seq[i]; + + // std::cout << "x=" << x << "; pos=" << pos << "; got=" << got << std::endl; + + bool good = (got == expected) && (pos == i); + REQUIRE_MESSAGE(good, "got " << got << " at position " << pos << "/" << seq.size() + << " but expected " << expected << " at position " << i); + } + std::cout << "EVERYTHING OK!" << std::endl; + + std::cout << "checking correctness of next_geq..." << std::endl; + for (uint64_t i = 0; i != 10000; ++i) { + uint64_t x = seq[rand() % seq.size()] + (i % 2 == 0 ? 3 : -3); // get some value + if (x >= seq.back()) x = seq.back() - 1; // lower bound must be x < back() for assumption + + auto p = es.next_geq(x); + uint64_t pos = p.pos; + uint64_t got = p.val; + + // std::cout << "x=" << x << "; pos=" << pos << "; got=" << got << std::endl; + auto it = std::lower_bound(seq.begin(), seq.end(), x); + assert(it != seq.end()); + uint64_t expected = *it; + uint64_t pos_expected = std::distance(seq.begin(), it); + bool good = (got == expected) && (pos == pos_expected); + REQUIRE_MESSAGE(good, "got " << got << " at position " << pos << "/" << seq.size() + << " but expected " << expected << " at position " + << pos_expected); + } + std::cout << "EVERYTHING OK!" << std::endl; +} + +TEST_CASE("small_next_geq") { + std::vector seq{0, 3, 5, 6, 9, 13, 23, 31}; + auto es = encode(seq); + + auto p = es.next_geq(0); + REQUIRE(p.pos == 0); + REQUIRE(p.val == 0); + + p = es.next_geq(2); + REQUIRE(p.pos == 1); + REQUIRE(p.val == 3); + + p = es.next_geq(4); + REQUIRE(p.pos == 2); + REQUIRE(p.val == 5); + + p = es.next_geq(6); + REQUIRE(p.pos == 3); + REQUIRE(p.val == 6); + + p = es.next_geq(23); + REQUIRE(p.pos == 6); + REQUIRE(p.val == 23); + + p = es.next_geq(28); + REQUIRE(p.pos == 7); + REQUIRE(p.val == 31); + + p = es.next_geq(30); + REQUIRE(p.pos == 7); + REQUIRE(p.val == 31); +} + +TEST_CASE("locate") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + // test::print(seq); + + std::cout << "checking correctness of locate..." << std::endl; + + for (uint64_t i = 0; i != seq.size() - 1; ++i) // + { + auto x = seq[i]; + + /* since x is in the sequence, p.first.pos must be i */ + auto p = es.locate(x); + uint64_t lo_pos = p.first.pos; + uint64_t lo_val = p.first.val; + uint64_t hi_pos = p.second.pos; + uint64_t hi_val = p.second.val; + + uint64_t expected_lo_pos = i; + uint64_t expected_lo_val = seq[i]; + uint64_t expected_hi_pos = i + 1; + uint64_t expected_hi_val = seq[i + 1]; + + bool good = (lo_val == expected_lo_val) && (lo_pos == expected_lo_pos); + REQUIRE_MESSAGE(good, "got " << lo_val << " at position " << lo_pos << "/" << seq.size() + << " but expected " << expected_lo_val << " at position " + << expected_lo_pos); + good = (hi_val == expected_hi_val) && (hi_pos == expected_hi_pos); + REQUIRE_MESSAGE(good, "got " << hi_val << " at position " << hi_pos << "/" << seq.size() + << " but expected " << expected_hi_val << " at position " + << expected_hi_pos); + } + std::cout << "EVERYTHING OK!" << std::endl; + + std::cout << "checking correctness of locate..." << std::endl; + + for (uint64_t i = 0; i != 10000; ++i) // + { + uint64_t x = seq[rand() % es.size()] + (i % 2 == 0 ? 3 : -3); // get some value value + if (x >= seq.back()) x = seq.back() - 1; // lower bound must be x < back() for assumption + + auto p = es.locate(x); + uint64_t lo_pos = p.first.pos; + uint64_t lo_val = p.first.val; + uint64_t hi_pos = p.second.pos; + uint64_t hi_val = p.second.val; + + auto it = std::upper_bound(seq.begin(), seq.end(), x) - 1; + uint64_t expected_lo_pos = std::distance(seq.begin(), it); + assert(expected_lo_pos != es.size() - 1); + uint64_t expected_lo_val = *it; + uint64_t expected_hi_pos = expected_lo_pos + 1; + uint64_t expected_hi_val = *(it + 1); + + bool good = (lo_val == expected_lo_val) && (lo_pos == expected_lo_pos); + REQUIRE_MESSAGE(good, "got " << lo_val << " at position " << lo_pos << "/" << seq.size() + << " but expected " << expected_lo_val << " at position " + << expected_lo_pos); + good = (hi_val == expected_hi_val) && (hi_pos == expected_hi_pos); + REQUIRE_MESSAGE(good, "got " << hi_val << " at position " << hi_pos << "/" << seq.size() + << " but expected " << expected_hi_val << " at position " + << expected_hi_pos); + } + + std::cout << "EVERYTHING OK!" << std::endl; +} From abb23fb8001d823b0f838fbc9c1b13659e5edf19 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 21 Sep 2025 21:29:49 +0200 Subject: [PATCH 2/4] added alternative iterator to endpoint sequence --- include/endpoints_sequence.hpp | 208 ++++++++++++++++++++++++++----- test/common.hpp | 1 + test/test_endpoints_sequence.cpp | 10 ++ 3 files changed, 191 insertions(+), 28 deletions(-) diff --git a/include/endpoints_sequence.hpp b/include/endpoints_sequence.hpp index bd67a44..db5121a 100644 --- a/include/endpoints_sequence.hpp +++ b/include/endpoints_sequence.hpp @@ -35,7 +35,7 @@ namespace bits { The goal of this class is to support fast next_geq queries at the price of some space increase compared to an `elias_fano` sequence, so: - The low bits are always 8. - - We keep an array of H of size ceil(U/2^8)-1, where H[i] indicates the + - We keep an array of H of size ceil(U/2^8), where H[i] indicates the position of the i-th 0 in the high bit array. Each H[i] is written in 1+log(n) bits. */ @@ -52,7 +52,7 @@ struct endpoints_sequence { bit_vector::builder bvb_high_bits(num_high_bits); m_low_bits.reserve(n); - compact_vector::builder cv_builder((universe + 256 - 1) / 256 - 1, // ceil(U/2^8)-1 + compact_vector::builder cv_builder((universe + 256 - 1) / 256, // ceil(U/2^8) util::ceil_log2_uint64(num_high_bits)); assert(*begin == 0); @@ -62,23 +62,21 @@ struct endpoints_sequence { m_low_bits.push_back(v & 255); uint64_t high_part = v >> 8; uint64_t pos = high_part + i; - // std::cout << "prev_pos = " << prev_pos << "; pos = " << pos << std::endl; bvb_high_bits.set(pos, 1); - for (uint64_t j = prev_pos + 1; j < pos; ++j) { - // std::cout << "j = " << j << std::endl; - cv_builder.push_back(j); - } + for (uint64_t j = prev_pos + 1; j < pos; ++j) cv_builder.push_back(j); prev_pos = pos; m_back = v; } + cv_builder.push_back(prev_pos + 1); bvb_high_bits.build(m_high_bits); + m_high_bits_d1.build(m_high_bits); + cv_builder.build(m_high_bits_d0); + // for (uint64_t i = 0; i != m_high_bits.num_bits(); ++i) { // std::cout << m_high_bits.get(i) << ' '; // } // std::cout << std::endl; - m_high_bits_d1.build(m_high_bits); - cv_builder.build(m_high_bits_d0); // auto it = m_high_bits_d0.begin(); // for (uint64_t i = 0; i != m_high_bits_d0.size(); ++i) { // std::cout << i << " : " << *it << '\n'; @@ -153,8 +151,142 @@ struct endpoints_sequence { uint64_t high = m_high_bits_it.next(); assert(high == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos)); uint64_t low = *m_low_bits_it; - m_val = (((high - m_pos) << 8) | low); + m_val = ((high - m_pos) << 8) | low; + ++m_low_bits_it; + } + }; + + /* + This version of iterator uses an iterator over the `low_bits` + and an iterator over `high_bits_d0`. + This is useful in next_geq and locate because only + these two sequences are accessed. + The class `iterator` accesses the `high_bits`, so when + used in `next_geq` and `locate`, we have accesses to three + sequences in total: `low_bits`, `high_bits`, and `high_bits_d0`. + */ + struct _iterator { + _iterator() : m_ptr(nullptr), m_pos(0), m_val(0) {} + + _iterator(endpoints_sequence const* ptr) + : m_ptr(ptr) + , m_pos(0) + , m_val(0) // + { + m_high_part = 0; + m_high_bits_d0_it = m_ptr->m_high_bits_d0.get_iterator_at(0); + m_cur_pos = *m_high_bits_d0_it; + m_prev_pos = m_cur_pos; + m_cluster_size = m_cur_pos; + assert(m_cluster_size > 0); + m_pos_in_cluster = 0; + m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; + uint64_t low = *m_low_bits_it; + m_val = (m_high_part << 8) | low; + } + + _iterator(endpoints_sequence const* ptr, uint64_t pos, uint64_t high_part) + : m_ptr(ptr) + , m_pos(pos) + , m_val(0) // + { + m_high_part = high_part; + m_high_bits_d0_it = m_ptr->m_high_bits_d0.get_iterator_at(high_part); + m_prev_pos = *m_high_bits_d0_it; + m_cluster_size = 0; + m_pos_in_cluster = 0; + + while (m_cluster_size == 0) { + m_high_part += 1; + ++m_high_bits_d0_it; + m_cur_pos = *m_high_bits_d0_it; + m_cluster_size = m_cur_pos - m_prev_pos - 1; + m_prev_pos = m_cur_pos; + } + m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; + + uint64_t low = *m_low_bits_it; + m_val = (m_high_part << 8) | low; + } + + bool has_next() const { return m_pos < m_ptr->size(); } + bool has_prev() const { return m_pos > 0; } + uint64_t value() const { return m_val; } + uint64_t position() const { return m_pos; } + + void next() { + ++m_pos; + ++m_pos_in_cluster; + if (!has_next()) return; + read_next_value(); + } + + /* + Return the value before the current position. + */ + uint64_t prev_value() { + assert(m_pos > 0); + + uint64_t high_part_prev_val = m_high_part; + + if (m_pos_in_cluster == 0) { + uint64_t cur_pos = m_cur_pos; + uint64_t prev_pos = m_cur_pos; + assert(m_cur_pos == *m_high_bits_d0_it); + uint64_t cluster_size = 0; + + assert(m_high_bits_d0_it != m_ptr->m_high_bits_d0.begin()); + high_part_prev_val -= 1; + --m_high_bits_d0_it; + cur_pos = *m_high_bits_d0_it; + + while (cluster_size == 0 and high_part_prev_val > 0) { + assert(m_high_bits_d0_it != m_ptr->m_high_bits_d0.begin()); + high_part_prev_val -= 1; + --m_high_bits_d0_it; + prev_pos = cur_pos; + cur_pos = *m_high_bits_d0_it; + cluster_size = prev_pos - cur_pos - 1; + } + + if (cluster_size > 0 || high_part_prev_val > 0) high_part_prev_val += 1; + } + + uint64_t low = *(m_low_bits_it - 1); + return (high_part_prev_val << 8) | low; + } + + private: + endpoints_sequence const* m_ptr; + uint64_t m_pos; + uint64_t m_val; + + uint64_t m_cur_pos, m_prev_pos; + uint64_t m_pos_in_cluster, m_cluster_size; + uint64_t m_high_part; + + compact_vector::iterator m_high_bits_d0_it; + std::vector::const_iterator m_low_bits_it; + + void read_next_value() { + assert(m_pos < m_ptr->size()); + + if (m_pos_in_cluster == m_cluster_size) { + m_cluster_size = 0; + m_pos_in_cluster = 0; + while (m_cluster_size == 0) { + m_high_part += 1; + ++m_high_bits_d0_it; + m_cur_pos = *m_high_bits_d0_it; + m_cluster_size = m_cur_pos - m_prev_pos - 1; + m_prev_pos = m_cur_pos; + } + } + assert(m_high_part + m_pos == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos)); + ++m_low_bits_it; + uint64_t low = *m_low_bits_it; + m_val = (m_high_part << 8) | low; } }; @@ -250,39 +382,59 @@ struct endpoints_sequence { visitor.visit(t.m_low_bits); } - std::pair _next_geq(const uint64_t x) const // + std::pair _next_geq(const uint64_t x) const // { assert(x >= 0); assert(x < back()); + // uint64_t h_x = x >> 8; + // uint64_t p = 0; + // uint64_t begin = 0; + // if (h_x > 0) { + // p = m_high_bits_d0.access(h_x - 1); + // begin = p - h_x + 1; + // } + // assert(begin < size()); + + // /* + // `begin` is the position of the elements that have high part >= h_x + // and `p` is the position in `m_high_bits` of the (h_x-1)-th 0, + // so it is passed to the iterator as a hint to recover the high part + // of the element at position `begin` without doing a select_1. + // */ + + // auto it = iterator(this, begin, p /* high hint */); + // uint64_t pos = begin; + // uint64_t val = it.value(); + // while (val < x) { + // ++pos; + // it.next(); + // val = it.value(); + // } + // /* now pos is the position of the element that is >= x */ + // assert(val >= x); + // assert(pos < size()); + // assert(val == access(pos)); + // assert(it.position() == pos); + // return {{pos, val}, it}; + uint64_t h_x = x >> 8; - uint64_t p = 0; uint64_t begin = 0; + _iterator it; if (h_x > 0) { - p = m_high_bits_d0.access(h_x - 1); + uint64_t p = m_high_bits_d0.access(h_x - 1); begin = p - h_x + 1; + it = _iterator(this, begin, h_x - 1); + } else { + it = _iterator(this); } assert(begin < size()); - /* - `begin` is the position of the elements that have high part >= h_x - and `p` is the position in `m_high_bits` of the (h_x-1)-th 0, - so it is passed to the iterator as a hint to recover the high part - of the element at position `begin` without doing a select_1. - */ - - auto it = iterator(this, begin, p /* high hint */); uint64_t pos = begin; uint64_t val = it.value(); while (val < x) { ++pos; - /* - Note: no need for bound checking here - because x <= back(), hence pos cannot - be equal to size(). - */ - it.next(); - val = it.value(); + val = (it.next(), it.value()); } /* now pos is the position of the element that is >= x */ assert(val >= x); diff --git a/test/common.hpp b/test/common.hpp index dbbaef7..8e2a632 100644 --- a/test/common.hpp +++ b/test/common.hpp @@ -18,6 +18,7 @@ std::vector get_sequence(const uint64_t sequence_length, const uint64_t max_int = 1000, // const uint64_t seed = essentials::get_random_seed()) // { + std::cout << "seed = " << seed << std::endl; srand(seed); double mean = rand() % (max_int + 1); std::mt19937 rng(seed); diff --git a/test/test_endpoints_sequence.cpp b/test/test_endpoints_sequence.cpp index f59e1d0..0460ba0 100644 --- a/test/test_endpoints_sequence.cpp +++ b/test/test_endpoints_sequence.cpp @@ -81,6 +81,7 @@ TEST_CASE("iterator::next") { TEST_CASE("next_geq") { std::vector seq = get_sequence(sequence_length); auto es = encode(seq); + // test::print(seq); std::cout << "checking correctness of next_geq..." << std::endl; @@ -111,6 +112,7 @@ TEST_CASE("next_geq") { uint64_t got = p.val; // std::cout << "x=" << x << "; pos=" << pos << "; got=" << got << std::endl; + auto it = std::lower_bound(seq.begin(), seq.end(), x); assert(it != seq.end()); uint64_t expected = *it; @@ -124,6 +126,8 @@ TEST_CASE("next_geq") { } TEST_CASE("small_next_geq") { + std::cout << "checking correctness of next_geq over a small sequence..." << std::endl; + std::vector seq{0, 3, 5, 6, 9, 13, 23, 31}; auto es = encode(seq); @@ -179,6 +183,9 @@ TEST_CASE("locate") { uint64_t expected_hi_pos = i + 1; uint64_t expected_hi_val = seq[i + 1]; + // std::cout << "x = " << x << " lo_pos = " << lo_pos << " lo_val = " << lo_val + // << " hi_pos = " << hi_pos << " hi_val = " << hi_val << std::endl; + bool good = (lo_val == expected_lo_val) && (lo_pos == expected_lo_pos); REQUIRE_MESSAGE(good, "got " << lo_val << " at position " << lo_pos << "/" << seq.size() << " but expected " << expected_lo_val << " at position " @@ -203,6 +210,9 @@ TEST_CASE("locate") { uint64_t hi_pos = p.second.pos; uint64_t hi_val = p.second.val; + // std::cout << "x = " << x << " lo_pos = " << lo_pos << " lo_val = " << lo_val + // << " hi_pos = " << hi_pos << " hi_val = " << hi_val << std::endl; + auto it = std::upper_bound(seq.begin(), seq.end(), x) - 1; uint64_t expected_lo_pos = std::distance(seq.begin(), it); assert(expected_lo_pos != es.size() - 1); From 8bd915bc51a7b675a43874bc7e1073069d6bf1df Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 22 Sep 2025 09:36:27 +0200 Subject: [PATCH 3/4] alt iterator in endpoints_sequence is actually slower when used in SSHash --- include/endpoints_sequence.hpp | 217 +++++---------------------------- 1 file changed, 28 insertions(+), 189 deletions(-) diff --git a/include/endpoints_sequence.hpp b/include/endpoints_sequence.hpp index db5121a..08cc0be 100644 --- a/include/endpoints_sequence.hpp +++ b/include/endpoints_sequence.hpp @@ -37,7 +37,7 @@ namespace bits { - The low bits are always 8. - We keep an array of H of size ceil(U/2^8), where H[i] indicates the position of the i-th 0 in the high bit array. Each H[i] is written - in 1+log(n) bits. + in 1+log(n) bits because the high bit array has length <= 2n. */ template @@ -99,14 +99,19 @@ struct endpoints_sequence { read_next_value(); } - iterator(endpoints_sequence const* ptr, uint64_t pos, uint64_t high_hint) + iterator(endpoints_sequence const* ptr, uint64_t pos, uint64_t pos_in_high_bits /* hint */) : m_ptr(ptr) , m_pos(pos) , m_val(0) // { - if (!has_next() or m_ptr->m_high_bits_d1.num_positions() == 0) return; - assert(high_hint == 0 || m_ptr->m_high_bits.get(high_hint) == 0); - m_high_bits_it = m_ptr->m_high_bits.get_iterator_at(high_hint); + /* + These two assertions are valid because this constructor is only used + in `next_geq`. + */ + assert(has_next()); + assert(pos_in_high_bits == 0 || m_ptr->m_high_bits.get(pos_in_high_bits) == 0); + + m_high_bits_it = m_ptr->m_high_bits.get_iterator_at(pos_in_high_bits); m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; read_next_value(); } @@ -130,13 +135,13 @@ struct endpoints_sequence { uint64_t pos = m_pos - 1; /* `read_next_value()` sets the state ahead of 1 position, - hence must go back by 2 to get previous value. + hence must go back by 2 positions to get previous value. */ assert(m_high_bits_it.position() >= 2); uint64_t high = m_high_bits_it.prev(m_high_bits_it.position() - 2); assert(high == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, pos)); uint64_t low = *(m_low_bits_it - 2); - return (((high - pos) << 8) | low); + return ((high - pos) << 8) | low; } private: @@ -156,140 +161,6 @@ struct endpoints_sequence { } }; - /* - This version of iterator uses an iterator over the `low_bits` - and an iterator over `high_bits_d0`. - This is useful in next_geq and locate because only - these two sequences are accessed. - The class `iterator` accesses the `high_bits`, so when - used in `next_geq` and `locate`, we have accesses to three - sequences in total: `low_bits`, `high_bits`, and `high_bits_d0`. - */ - struct _iterator { - _iterator() : m_ptr(nullptr), m_pos(0), m_val(0) {} - - _iterator(endpoints_sequence const* ptr) - : m_ptr(ptr) - , m_pos(0) - , m_val(0) // - { - m_high_part = 0; - m_high_bits_d0_it = m_ptr->m_high_bits_d0.get_iterator_at(0); - m_cur_pos = *m_high_bits_d0_it; - m_prev_pos = m_cur_pos; - m_cluster_size = m_cur_pos; - assert(m_cluster_size > 0); - m_pos_in_cluster = 0; - m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; - uint64_t low = *m_low_bits_it; - m_val = (m_high_part << 8) | low; - } - - _iterator(endpoints_sequence const* ptr, uint64_t pos, uint64_t high_part) - : m_ptr(ptr) - , m_pos(pos) - , m_val(0) // - { - m_high_part = high_part; - m_high_bits_d0_it = m_ptr->m_high_bits_d0.get_iterator_at(high_part); - m_prev_pos = *m_high_bits_d0_it; - m_cluster_size = 0; - m_pos_in_cluster = 0; - - while (m_cluster_size == 0) { - m_high_part += 1; - ++m_high_bits_d0_it; - m_cur_pos = *m_high_bits_d0_it; - m_cluster_size = m_cur_pos - m_prev_pos - 1; - m_prev_pos = m_cur_pos; - } - m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; - - uint64_t low = *m_low_bits_it; - m_val = (m_high_part << 8) | low; - } - - bool has_next() const { return m_pos < m_ptr->size(); } - bool has_prev() const { return m_pos > 0; } - uint64_t value() const { return m_val; } - uint64_t position() const { return m_pos; } - - void next() { - ++m_pos; - ++m_pos_in_cluster; - if (!has_next()) return; - read_next_value(); - } - - /* - Return the value before the current position. - */ - uint64_t prev_value() { - assert(m_pos > 0); - - uint64_t high_part_prev_val = m_high_part; - - if (m_pos_in_cluster == 0) { - uint64_t cur_pos = m_cur_pos; - uint64_t prev_pos = m_cur_pos; - assert(m_cur_pos == *m_high_bits_d0_it); - uint64_t cluster_size = 0; - - assert(m_high_bits_d0_it != m_ptr->m_high_bits_d0.begin()); - high_part_prev_val -= 1; - --m_high_bits_d0_it; - cur_pos = *m_high_bits_d0_it; - - while (cluster_size == 0 and high_part_prev_val > 0) { - assert(m_high_bits_d0_it != m_ptr->m_high_bits_d0.begin()); - high_part_prev_val -= 1; - --m_high_bits_d0_it; - prev_pos = cur_pos; - cur_pos = *m_high_bits_d0_it; - cluster_size = prev_pos - cur_pos - 1; - } - - if (cluster_size > 0 || high_part_prev_val > 0) high_part_prev_val += 1; - } - - uint64_t low = *(m_low_bits_it - 1); - return (high_part_prev_val << 8) | low; - } - - private: - endpoints_sequence const* m_ptr; - uint64_t m_pos; - uint64_t m_val; - - uint64_t m_cur_pos, m_prev_pos; - uint64_t m_pos_in_cluster, m_cluster_size; - uint64_t m_high_part; - - compact_vector::iterator m_high_bits_d0_it; - std::vector::const_iterator m_low_bits_it; - - void read_next_value() { - assert(m_pos < m_ptr->size()); - - if (m_pos_in_cluster == m_cluster_size) { - m_cluster_size = 0; - m_pos_in_cluster = 0; - while (m_cluster_size == 0) { - m_high_part += 1; - ++m_high_bits_d0_it; - m_cur_pos = *m_high_bits_d0_it; - m_cluster_size = m_cur_pos - m_prev_pos - 1; - m_prev_pos = m_cur_pos; - } - } - assert(m_high_part + m_pos == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos)); - - ++m_low_bits_it; - uint64_t low = *m_low_bits_it; - m_val = (m_high_part << 8) | low; - } - }; - iterator get_iterator_at(uint64_t pos) const { return iterator(this, pos); } iterator begin() const { return get_iterator_at(0); } @@ -306,7 +177,7 @@ struct endpoints_sequence { /* Return [position,value] of the smallest element that is >= x. */ - return_value next_geq(const uint64_t x) const { return _next_geq(x).first; } + return_value next_geq(const uint64_t x) const { return next_geq_helper(x).first; } /* Determine integers lo and hi, with lo < hi, such that lo <= x < hi, where: @@ -315,7 +186,7 @@ struct endpoints_sequence { Return the tuple [lo_pos, lo, hi_pos, hi]. */ std::pair locate(const uint64_t x) const { - auto [lo, it] = _next_geq(x); + auto [lo, it] = next_geq_helper(x); return_value hi{lo.pos, lo.val}; if (lo.val > x) { @@ -336,14 +207,6 @@ struct endpoints_sequence { uint64_t size() const { return m_low_bits.size(); } uint64_t num_bytes() const { - // std::cout << "m_high_bits = " << (8.0 * m_high_bits.num_bytes()) / size() << std::endl; - // std::cout << "m_high_bits_d1 = " << (8.0 * m_high_bits_d1.num_bytes()) / size() - // << std::endl; - // std::cout << "m_high_bits_d0 = " << (8.0 * m_high_bits_d0.num_bytes()) / size() - // << std::endl; - // std::cout << "m_low_bits = " << (8.0 * essentials::vec_bytes(m_low_bits)) / size() - // << std::endl; - return sizeof(m_back) + m_high_bits.num_bytes() + m_high_bits_d1.num_bytes() + m_high_bits_d0.num_bytes() + essentials::vec_bytes(m_low_bits); } @@ -382,65 +245,41 @@ struct endpoints_sequence { visitor.visit(t.m_low_bits); } - std::pair _next_geq(const uint64_t x) const // + std::pair next_geq_helper(const uint64_t x) const // { assert(x >= 0); assert(x < back()); - // uint64_t h_x = x >> 8; - // uint64_t p = 0; - // uint64_t begin = 0; - // if (h_x > 0) { - // p = m_high_bits_d0.access(h_x - 1); - // begin = p - h_x + 1; - // } - // assert(begin < size()); - - // /* - // `begin` is the position of the elements that have high part >= h_x - // and `p` is the position in `m_high_bits` of the (h_x-1)-th 0, - // so it is passed to the iterator as a hint to recover the high part - // of the element at position `begin` without doing a select_1. - // */ - - // auto it = iterator(this, begin, p /* high hint */); - // uint64_t pos = begin; - // uint64_t val = it.value(); - // while (val < x) { - // ++pos; - // it.next(); - // val = it.value(); - // } - // /* now pos is the position of the element that is >= x */ - // assert(val >= x); - // assert(pos < size()); - // assert(val == access(pos)); - // assert(it.position() == pos); - // return {{pos, val}, it}; - uint64_t h_x = x >> 8; + uint64_t p = 0; uint64_t begin = 0; - _iterator it; if (h_x > 0) { - uint64_t p = m_high_bits_d0.access(h_x - 1); + p = m_high_bits_d0.access(h_x - 1); begin = p - h_x + 1; - it = _iterator(this, begin, h_x - 1); - } else { - it = _iterator(this); } assert(begin < size()); + /* + `begin` is the position of the elements that have high part >= h_x + and `p` is the position in `m_high_bits` of the (h_x-1)-th 0, + so it is passed to the iterator as a hint to recover the high part + of the element at position `begin` without doing a select_1. + */ + + auto it = iterator(this, begin, p /* position in high_bits hint */); uint64_t pos = begin; uint64_t val = it.value(); while (val < x) { ++pos; - val = (it.next(), it.value()); + it.next(); + val = it.value(); } /* now pos is the position of the element that is >= x */ assert(val >= x); assert(pos < size()); assert(val == access(pos)); assert(it.position() == pos); + return {{pos, val}, it}; } }; From 44f7876bc1e7c0abc007f27fd0ac6548bb484453 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 22 Sep 2025 11:08:56 +0200 Subject: [PATCH 4/4] default init compact_vector::enumerator to avoid warning on Linux --- include/compact_vector.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index 59c1753..2da67a0 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -14,7 +14,7 @@ struct compact_vector // struct enumerator { using iterator_category = std::random_access_iterator_tag; - enumerator() {} + enumerator() : m_i(0), m_cur_val(0), m_cur_block(0), m_cur_shift(0), m_vec(nullptr) {} enumerator(Vec const* vec, uint64_t i) : m_i(i)