diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h index 26cfc61d3db813..eb840892ff1859 100644 --- a/be/src/vec/common/hash_table/hash_table.h +++ b/be/src/vec/common/hash_table/hash_table.h @@ -421,63 +421,6 @@ class HashTable : private boost::noncopyable, } } - /// Increase the size of the buffer. - void resize(size_t for_num_elems = 0, size_t for_buf_size = 0) { - SCOPED_RAW_TIMER(&_resize_timer_ns); -#ifdef DBMS_HASH_MAP_DEBUG_RESIZES - Stopwatch watch; -#endif - - size_t old_size = grower.buf_size(); - - /** In case of exception for the object to remain in the correct state, - * changing the variable `grower` (which determines the buffer size of the hash table) - * is postponed for a moment after a real buffer change. - * The temporary variable `new_grower` is used to determine the new size. - */ - Grower new_grower = grower; - if (for_num_elems) { - new_grower.set(for_num_elems); - if (new_grower.buf_size() <= old_size) return; - } else if (for_buf_size) { - new_grower.set_buf_size(for_buf_size); - if (new_grower.buf_size() <= old_size) return; - } else - new_grower.increase_size(); - - /// Expand the space. - buf = reinterpret_cast(Allocator::realloc(buf, get_buffer_size_in_bytes(), - new_grower.buf_size() * sizeof(Cell))); - grower = new_grower; - - /** Now some items may need to be moved to a new location. - * The element can stay in place, or move to a new location "on the right", - * or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location. - */ - size_t i = 0; - for (; i < old_size; ++i) - if (!buf[i].is_zero(*this) && !buf[i].is_deleted()) - reinsert(buf[i], buf[i].get_hash(*this)); - - /** There is also a special case: - * if the element was to be at the end of the old buffer, [ x] - * but is at the beginning because of the collision resolution chain, [o x] - * then after resizing, it will first be out of place again, [ xo ] - * and in order to transfer it where necessary, - * after transferring all the elements from the old halves you need to [ o x ] - * process tail from the collision resolution chain immediately after it [ o x ] - */ - for (; !buf[i].is_zero(*this) && !buf[i].is_deleted(); ++i) - reinsert(buf[i], buf[i].get_hash(*this)); - -#ifdef DBMS_HASH_MAP_DEBUG_RESIZES - watch.stop(); - std::cerr << std::fixed << std::setprecision(3) << "Resize from " << old_size << " to " - << grower.buf_size() << " took " << watch.elapsedSeconds() << " sec." - << std::endl; -#endif - } - /** Paste into the new buffer the value that was in the old buffer. * Used when increasing the buffer size. */ @@ -613,6 +556,63 @@ class HashTable : private boost::noncopyable, return *this; } + /// Increase the size of the buffer. + void resize(size_t for_num_elems = 0, size_t for_buf_size = 0) { + SCOPED_RAW_TIMER(&_resize_timer_ns); +#ifdef DBMS_HASH_MAP_DEBUG_RESIZES + Stopwatch watch; +#endif + + size_t old_size = grower.buf_size(); + + /** In case of exception for the object to remain in the correct state, + * changing the variable `grower` (which determines the buffer size of the hash table) + * is postponed for a moment after a real buffer change. + * The temporary variable `new_grower` is used to determine the new size. + */ + Grower new_grower = grower; + if (for_num_elems) { + new_grower.set(for_num_elems); + if (new_grower.buf_size() <= old_size) return; + } else if (for_buf_size) { + new_grower.set_buf_size(for_buf_size); + if (new_grower.buf_size() <= old_size) return; + } else + new_grower.increase_size(); + + /// Expand the space. + buf = reinterpret_cast(Allocator::realloc(buf, get_buffer_size_in_bytes(), + new_grower.buf_size() * sizeof(Cell))); + grower = new_grower; + + /** Now some items may need to be moved to a new location. + * The element can stay in place, or move to a new location "on the right", + * or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location. + */ + size_t i = 0; + for (; i < old_size; ++i) + if (!buf[i].is_zero(*this) && !buf[i].is_deleted()) + reinsert(buf[i], buf[i].get_hash(*this)); + + /** There is also a special case: + * if the element was to be at the end of the old buffer, [ x] + * but is at the beginning because of the collision resolution chain, [o x] + * then after resizing, it will first be out of place again, [ xo ] + * and in order to transfer it where necessary, + * after transferring all the elements from the old halves you need to [ o x ] + * process tail from the collision resolution chain immediately after it [ o x ] + */ + for (; !buf[i].is_zero(*this) && !buf[i].is_deleted(); ++i) + reinsert(buf[i], buf[i].get_hash(*this)); + +#ifdef DBMS_HASH_MAP_DEBUG_RESIZES + watch.stop(); + std::cerr << std::fixed << std::setprecision(3) << "Resize from " << old_size << " to " + << grower.buf_size() << " took " << watch.elapsedSeconds() << " sec." + << std::endl; +#endif + } + class iterator : public iterator_base { public: using iterator_base::iterator_base; diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index a42a8215fff729..2bb5f0dd79441a 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -62,6 +62,9 @@ struct ProcessHashTableBuild { _join_node->_mem_used += bucket_bytes - old_bucket_bytes; COUNTER_SET(_join_node->_build_buckets_counter, bucket_size); }}; + //resize ahead is good for keys whose values are almost unique. + //we only resize for the first collected block(<=4G) + hash_table_ctx.hash_table.resize(_rows); KeyGetter key_getter(_build_raw_ptrs, _join_node->_build_key_sz, nullptr);