diff --git a/CMakeLists.txt b/CMakeLists.txt index 47984e6c..0627b00f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -618,8 +618,6 @@ set(LIBPARQUET_SRCS src/parquet/parquet_constants.cpp src/parquet/parquet_types.cpp - - src/parquet/util/cpu-info.cc src/parquet/util/memory.cc ) diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc index 44776a8f..5514d8b7 100644 --- a/benchmarks/decode_benchmark.cc +++ b/benchmarks/decode_benchmark.cc @@ -40,10 +40,10 @@ class DeltaBitPackEncoder { uint8_t* Encode(int* encoded_len) { uint8_t* result = new uint8_t[10 * 1024 * 1024]; - int num_mini_blocks = parquet::BitUtil::Ceil(num_values() - 1, mini_block_size_); + int num_mini_blocks = arrow::BitUtil::Ceil(num_values() - 1, mini_block_size_); uint8_t* mini_block_widths = NULL; - parquet::BitWriter writer(result, 10 * 1024 * 1024); + arrow::BitWriter writer(result, 10 * 1024 * 1024); // Writer the size of each block. We only use 1 block currently. writer.PutVlqInt(num_mini_blocks * mini_block_size_); @@ -83,7 +83,7 @@ class DeltaBitPackEncoder { // The bit width for this block is the number of bits needed to store // (max_delta - min_delta). - int bit_width = parquet::BitUtil::NumRequiredBits(max_delta - min_delta); + int bit_width = arrow::BitUtil::NumRequiredBits(max_delta - min_delta); mini_block_widths[i] = bit_width; // Encode this mini blocking using min_delta and bit_width diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 5c4e5653..716debcb 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -109,34 +109,6 @@ set(LIBS ${LIBS} ${Boost_LIBRARIES}) # ---------------------------------------------------------------------- # ZLIB -set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") -set(ZLIB_HOME "${ZLIB_PREFIX}") -set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") -if (MSVC) - if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) - else() - set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) - endif() -else() - set(ZLIB_STATIC_LIB_NAME libz.a) -endif() -set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") -set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX} - -DCMAKE_C_FLAGS=${EP_C_FLAGS} - -DBUILD_SHARED_LIBS=OFF) -ExternalProject_Add(zlib_ep - URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz" - BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" - ${ZLIB_BUILD_BYPRODUCTS} - CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) - -include_directories(SYSTEM ${ZLIB_INCLUDE_DIR}) -add_library(zlibstatic STATIC IMPORTED) -set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB}) -add_dependencies(zlibstatic zlib_ep) - # ---------------------------------------------------------------------- # Thrift @@ -144,6 +116,29 @@ add_dependencies(zlibstatic zlib_ep) find_package(Thrift) if (NOT THRIFT_FOUND) + set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") + set(ZLIB_HOME "${ZLIB_PREFIX}") + set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") + if (MSVC) + if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) + else() + set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + endif() + else() + set(ZLIB_STATIC_LIB_NAME libz.a) + endif() + set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") + set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX} + -DCMAKE_C_FLAGS=${EP_C_FLAGS} + -DBUILD_SHARED_LIBS=OFF) + ExternalProject_Add(zlib_ep + URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz" + BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" + ${ZLIB_BUILD_BYPRODUCTS} + CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) + set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep/src/thrift_ep-install") set(THRIFT_HOME "${THRIFT_PREFIX}") set(THRIFT_INCLUDE_DIR "${THRIFT_PREFIX}/include") @@ -341,7 +336,7 @@ if (NOT ARROW_FOUND) -DARROW_BUILD_TESTS=OFF) if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "") - set(ARROW_VERSION "98f7cac6e162d9775d615d07b9867c1ec0030f82") + set(ARROW_VERSION "a58893882ac8acd1ac4a5036685cbf09a9a09673") else() set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}") endif() diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc index a3a26c93..dd1c9d26 100644 --- a/src/parquet/arrow/reader.cc +++ b/src/parquet/arrow/reader.cc @@ -26,12 +26,12 @@ #include #include +#include "arrow/api.h" +#include "arrow/util/bit-util.h" + #include "parquet/arrow/schema.h" -#include "parquet/util/bit-util.h" #include "parquet/util/schema-util.h" -#include "arrow/api.h" - using arrow::Array; using arrow::BooleanArray; using arrow::Column; diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index b8cb45cf..1e3f6de1 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -21,14 +21,13 @@ #include #include -#include "parquet/util/bit-util.h" -#include "parquet/util/logging.h" - -#include "parquet/arrow/schema.h" - #include "arrow/api.h" +#include "arrow/util/bit-util.h" #include "arrow/visitor_inline.h" +#include "parquet/arrow/schema.h" +#include "parquet/util/logging.h" + using arrow::Array; using arrow::BinaryArray; using arrow::FixedSizeBinaryArray; diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc index f63f6f18..ce6936dd 100644 --- a/src/parquet/column_reader.cc +++ b/src/parquet/column_reader.cc @@ -21,10 +21,11 @@ #include #include +#include "arrow/util/rle-encoding.h" + #include "parquet/column_page.h" #include "parquet/encoding-internal.h" #include "parquet/properties.h" -#include "parquet/util/rle-encoding.h" using arrow::MemoryPool; @@ -45,7 +46,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, num_bytes = *reinterpret_cast(data); const uint8_t* decoder_data = data + sizeof(int32_t); if (!rle_decoder_) { - rle_decoder_.reset(new RleDecoder(decoder_data, num_bytes, bit_width_)); + rle_decoder_.reset(new ::arrow::RleDecoder(decoder_data, num_bytes, bit_width_)); } else { rle_decoder_->Reset(decoder_data, num_bytes, bit_width_); } @@ -55,7 +56,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, num_bytes = static_cast(BitUtil::Ceil(num_buffered_values * bit_width_, 8)); if (!bit_packed_decoder_) { - bit_packed_decoder_.reset(new BitReader(data, num_bytes)); + bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes)); } else { bit_packed_decoder_->Reset(data, num_bytes); } diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h index f4b8b02b..e733d67a 100644 --- a/src/parquet/column_reader.h +++ b/src/parquet/column_reader.h @@ -36,11 +36,15 @@ #include "parquet/util/memory.h" #include "parquet/util/visibility.h" -namespace parquet { +namespace arrow { class BitReader; class RleDecoder; +} // namespace arrow + +namespace parquet { + class PARQUET_EXPORT LevelDecoder { public: LevelDecoder(); @@ -58,8 +62,8 @@ class PARQUET_EXPORT LevelDecoder { int bit_width_; int num_values_remaining_; Encoding::type encoding_; - std::unique_ptr rle_decoder_; - std::unique_ptr bit_packed_decoder_; + std::unique_ptr<::arrow::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::BitReader> bit_packed_decoder_; }; class PARQUET_EXPORT ColumnReader { diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index c13d4a0f..21550da3 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -17,15 +17,20 @@ #include "parquet/column_writer.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/rle-encoding.h" + #include "parquet/encoding-internal.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/util/logging.h" #include "parquet/util/memory.h" -#include "parquet/util/rle-encoding.h" namespace parquet { +using BitWriter = ::arrow::BitWriter; +using RleEncoder = ::arrow::RleEncoder; + LevelEncoder::LevelEncoder() {} LevelEncoder::~LevelEncoder() {} diff --git a/src/parquet/column_writer.h b/src/parquet/column_writer.h index 4e113de1..1637780e 100644 --- a/src/parquet/column_writer.h +++ b/src/parquet/column_writer.h @@ -30,11 +30,15 @@ #include "parquet/util/memory.h" #include "parquet/util/visibility.h" -namespace parquet { +namespace arrow { class BitWriter; class RleEncoder; +} // namespace arrow + +namespace parquet { + class PARQUET_EXPORT LevelEncoder { public: LevelEncoder(); @@ -61,8 +65,8 @@ class PARQUET_EXPORT LevelEncoder { int bit_width_; int rle_length_; Encoding::type encoding_; - std::unique_ptr rle_encoder_; - std::unique_ptr bit_packed_encoder_; + std::unique_ptr<::arrow::RleEncoder> rle_encoder_; + std::unique_ptr<::arrow::BitWriter> bit_packed_encoder_; }; static constexpr int WRITE_BATCH_SIZE = 1000; diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h index 61b8e242..88d781f9 100644 --- a/src/parquet/encoding-internal.h +++ b/src/parquet/encoding-internal.h @@ -24,21 +24,23 @@ #include #include -#include +#include "arrow/util/bit-stream-utils.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/cpu-info.h" +#include "arrow/util/hash-util.h" +#include "arrow/util/rle-encoding.h" #include "parquet/encoding.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/bit-stream-utils.inline.h" -#include "parquet/util/bit-util.h" -#include "parquet/util/cpu-info.h" -#include "parquet/util/hash-util.h" #include "parquet/util/memory.h" -#include "parquet/util/rle-encoding.h" namespace parquet { +namespace BitUtil = ::arrow::BitUtil; +using HashUtil = ::arrow::HashUtil; + class ColumnDescriptor; // ---------------------------------------------------------------------- @@ -136,7 +138,7 @@ class PlainDecoder : public Decoder { virtual void SetData(int num_values, const uint8_t* data, int len) { num_values_ = num_values; - bit_reader_ = BitReader(data, len); + bit_reader_ = ::arrow::BitReader(data, len); } // Two flavors of bool decoding @@ -161,7 +163,7 @@ class PlainDecoder : public Decoder { } private: - BitReader bit_reader_; + ::arrow::BitReader bit_reader_; }; // ---------------------------------------------------------------------- @@ -196,7 +198,7 @@ class PlainEncoder : public Encoder { bits_available_(kInMemoryDefaultCapacity * 8), bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)), values_sink_(new InMemoryOutputStream(pool)) { - bit_writer_.reset(new BitWriter( + bit_writer_.reset(new ::arrow::BitWriter( bits_buffer_->mutable_data(), static_cast(bits_buffer_->size()))); } @@ -260,7 +262,7 @@ class PlainEncoder : public Encoder { protected: int bits_available_; - std::unique_ptr bit_writer_; + std::unique_ptr<::arrow::BitWriter> bit_writer_; std::shared_ptr bits_buffer_; std::unique_ptr values_sink_; }; @@ -325,12 +327,13 @@ class DictionaryDecoder : public Decoder { uint8_t bit_width = *data; ++data; --len; - idx_decoder_ = RleDecoder(data, len, bit_width); + idx_decoder_ = ::arrow::RleDecoder(data, len, bit_width); } int Decode(T* buffer, int max_values) override { max_values = std::min(max_values, num_values_); - int decoded_values = idx_decoder_.GetBatchWithDict(dictionary_, buffer, max_values); + int decoded_values = + idx_decoder_.GetBatchWithDict(dictionary_.data(), buffer, max_values); if (decoded_values != max_values) { ParquetException::EofException(); } num_values_ -= max_values; return max_values; @@ -338,8 +341,8 @@ class DictionaryDecoder : public Decoder { int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) override { - int decoded_values = idx_decoder_.GetBatchWithDictSpaced( - dictionary_, buffer, num_values, null_count, valid_bits, valid_bits_offset); + int decoded_values = idx_decoder_.GetBatchWithDictSpaced(dictionary_.data(), buffer, + num_values, null_count, valid_bits, valid_bits_offset); if (decoded_values != num_values) { ParquetException::EofException(); } return decoded_values; } @@ -354,7 +357,7 @@ class DictionaryDecoder : public Decoder { // pointers). std::shared_ptr byte_array_data_; - RleDecoder idx_decoder_; + ::arrow::RleDecoder idx_decoder_; }; template @@ -446,7 +449,7 @@ class DictEncoder : public Encoder { dict_encoded_size_(0), type_length_(desc->type_length()) { hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY); - if (!CpuInfo::initialized()) { CpuInfo::Init(); } + if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } } virtual ~DictEncoder() { DCHECK(buffered_indices_.empty()); } @@ -464,9 +467,9 @@ class DictEncoder : public Encoder { // reserve // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used // but not reserving them would cause the encoder to fail. - return 1 + RleEncoder::MaxBufferSize( + return 1 + ::arrow::RleEncoder::MaxBufferSize( bit_width(), static_cast(buffered_indices_.size())) + - RleEncoder::MinBufferSize(bit_width()); + ::arrow::RleEncoder::MinBufferSize(bit_width()); } /// The minimum bit width required to encode the currently buffered indices. @@ -727,7 +730,7 @@ inline int DictEncoder::WriteIndices(uint8_t* buffer, int buffer_len) { ++buffer; --buffer_len; - RleEncoder encoder(buffer, buffer_len, bit_width()); + ::arrow::RleEncoder encoder(buffer, buffer_len, bit_width()); for (int index : buffered_indices_) { if (!encoder.Put(index)) return -1; } @@ -756,7 +759,7 @@ class DeltaBitPackDecoder : public Decoder { virtual void SetData(int num_values, const uint8_t* data, int len) { num_values_ = num_values; - decoder_ = BitReader(data, len); + decoder_ = ::arrow::BitReader(data, len); values_current_block_ = 0; values_current_mini_block_ = 0; } @@ -819,7 +822,7 @@ class DeltaBitPackDecoder : public Decoder { return max_values; } - BitReader decoder_; + ::arrow::BitReader decoder_; int32_t values_current_block_; int32_t num_mini_blocks_; uint64_t values_per_mini_block_; diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc index 2e780365..dcd813d3 100644 --- a/src/parquet/encoding-test.cc +++ b/src/parquet/encoding-test.cc @@ -22,10 +22,11 @@ #include #include +#include "arrow/util/bit-util.h" + #include "parquet/encoding-internal.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/bit-util.h" #include "parquet/util/memory.h" #include "parquet/util/test-common.h" @@ -64,7 +65,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { ASSERT_EQ(nvalues, values_decoded); for (int i = 0; i < nvalues; ++i) { - ASSERT_EQ(draws[i], BitUtil::GetArrayBit(decode_data, i)) << i; + ASSERT_EQ(draws[i], BitUtil::GetBit(decode_data, i)) << i; } } diff --git a/src/parquet/encoding.h b/src/parquet/encoding.h index 7c51cfd1..1417e984 100644 --- a/src/parquet/encoding.h +++ b/src/parquet/encoding.h @@ -21,12 +21,11 @@ #include #include -#include +#include "arrow/util/bit-util.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/bit-util.h" #include "parquet/util/memory.h" namespace parquet { diff --git a/src/parquet/types.h b/src/parquet/types.h index 7ec38250..38015c4d 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -24,7 +24,8 @@ #include #include -#include "parquet/util/compiler-util.h" +#include "arrow/util/compiler-util.h" + #include "parquet/util/visibility.h" namespace parquet { diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt index e8fbdc74..47172667 100644 --- a/src/parquet/util/CMakeLists.txt +++ b/src/parquet/util/CMakeLists.txt @@ -17,19 +17,11 @@ # Headers: util install(FILES - bit-stream-utils.h - bit-stream-utils.inline.h - bit-util.h buffer-builder.h - compiler-util.h - cpu-info.h - hash-util.h logging.h macros.h memory.h - rle-encoding.h stopwatch.h - sse-util.h visibility.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util") @@ -52,7 +44,5 @@ if (PARQUET_BUILD_BENCHMARKS) endif() endif() -ADD_PARQUET_TEST(bit-util-test) ADD_PARQUET_TEST(comparison-test) ADD_PARQUET_TEST(memory-test) -ADD_PARQUET_TEST(rle-test) diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h deleted file mode 100644 index 497a9606..00000000 --- a/src/parquet/util/bit-stream-utils.h +++ /dev/null @@ -1,170 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29 - -#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H -#define PARQUET_UTIL_BIT_STREAM_UTILS_H - -#include -#include -#include - -#include "parquet/util/bit-util.h" -#include "parquet/util/compiler-util.h" -#include "parquet/util/logging.h" - -namespace parquet { - -/// Utility class to write bit/byte streams. This class can write data to either be -/// bit packed or byte aligned (and a single stream that has a mix of both). -/// This class does not allocate memory. -class BitWriter { - public: - /// buffer: buffer to write bits to. Buffer should be preallocated with - /// 'buffer_len' bytes. - BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) { - Clear(); - } - - void Clear() { - buffered_values_ = 0; - byte_offset_ = 0; - bit_offset_ = 0; - } - - /// The number of current bytes written, including the current byte (i.e. may include a - /// fraction of a byte). Includes buffered values. - int bytes_written() const { - return byte_offset_ + static_cast(BitUtil::Ceil(bit_offset_, 8)); - } - uint8_t* buffer() const { return buffer_; } - int buffer_len() const { return max_bytes_; } - - /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - /// packed. Returns false if there was not enough space. num_bits must be <= 32. - bool PutValue(uint64_t v, int num_bits); - - /// Writes v to the next aligned byte using num_bytes. If T is larger than - /// num_bytes, the extra high-order bytes will be ignored. Returns false if - /// there was not enough space. - template - bool PutAligned(T v, int num_bytes); - - /// Write a Vlq encoded int to the buffer. Returns false if there was not enough - /// room. The value is written byte aligned. - /// For more details on vlq: - /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint32_t v); - - // Writes an int zigzag encoded. - bool PutZigZagVlqInt(int32_t v); - - /// Get a pointer to the next aligned byte and advance the underlying buffer - /// by num_bytes. - /// Returns NULL if there was not enough space. - uint8_t* GetNextBytePtr(int num_bytes = 1); - - /// Flushes all buffered values to the buffer. Call this when done writing to - /// the buffer. If 'align' is true, buffered_values_ is reset and any future - /// writes will be written to the next byte boundary. - void Flush(bool align = false); - - private: - uint8_t* buffer_; - int max_bytes_; - - /// Bit-packed values are initially written to this variable before being memcpy'd to - /// buffer_. This is faster than writing values byte by byte directly to buffer_. - uint64_t buffered_values_; - - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -/// Utility class to read bit/byte stream. This class can read bits or bytes -/// that are either byte aligned or not. It also has utilities to read multiple -/// bytes in one read (e.g. encoded int). -class BitReader { - public: - /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. - BitReader(const uint8_t* buffer, int buffer_len) - : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) { - int num_bytes = std::min(8, max_bytes_ - byte_offset_); - memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); - } - - BitReader() : buffer_(NULL), max_bytes_(0) {} - - void Reset(const uint8_t* buffer, int buffer_len) { - buffer_ = buffer; - max_bytes_ = buffer_len; - byte_offset_ = 0; - bit_offset_ = 0; - int num_bytes = std::min(8, max_bytes_ - byte_offset_); - memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); - } - - /// Gets the next value from the buffer. Returns true if 'v' could be read or false if - /// there are not enough bytes left. num_bits must be <= 32. - template - bool GetValue(int num_bits, T* v); - - /// Get a number of values from the buffer. Return the number of values actually read. - template - int GetBatch(int num_bits, T* v, int batch_size); - - /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T - /// needs to be a little-endian native type and big enough to store - /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will - /// be advanced to the start of the next byte before 'v' is read. Returns - /// false if there are not enough bytes left. - template - bool GetAligned(int num_bytes, T* v); - - /// Reads a vlq encoded int from the stream. The encoded int must start at - /// the beginning of a byte. Return false if there were not enough bytes in - /// the buffer. - bool GetVlqInt(int32_t* v); - - // Reads a zigzag encoded int `into` v. - bool GetZigZagVlqInt(int32_t* v); - - /// Returns the number of bytes left in the stream, not including the current - /// byte (i.e., there may be an additional fraction of a byte). - int bytes_left() { - return max_bytes_ - (byte_offset_ + static_cast(BitUtil::Ceil(bit_offset_, 8))); - } - - /// Maximum byte length of a vlq encoded int - static const int MAX_VLQ_BYTE_LEN = 5; - - private: - const uint8_t* buffer_; - int max_bytes_; - - /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is - /// faster than reading values byte by byte directly from buffer_. - uint64_t buffered_values_; - - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h deleted file mode 100644 index 5db16399..00000000 --- a/src/parquet/util/bit-stream-utils.inline.h +++ /dev/null @@ -1,258 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29 - -#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H -#define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H - -#include - -#include "parquet/util/bit-stream-utils.h" -#include "parquet/util/bpacking.h" - -namespace parquet { - -inline bool BitWriter::PutValue(uint64_t v, int num_bits) { - // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases) - DCHECK_LE(num_bits, 32); - DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits; - - if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; - - buffered_values_ |= v << bit_offset_; - bit_offset_ += num_bits; - - if (UNLIKELY(bit_offset_ >= 64)) { - // Flush buffered_values_ and write out bits of v that did not fit - memcpy(buffer_ + byte_offset_, &buffered_values_, 8); - buffered_values_ = 0; - byte_offset_ += 8; - bit_offset_ -= 64; - buffered_values_ = v >> (num_bits - bit_offset_); - } - DCHECK_LT(bit_offset_, 64); - return true; -} - -inline void BitWriter::Flush(bool align) { - int num_bytes = static_cast(BitUtil::Ceil(bit_offset_, 8)); - DCHECK_LE(byte_offset_ + num_bytes, max_bytes_); - memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes); - - if (align) { - buffered_values_ = 0; - byte_offset_ += num_bytes; - bit_offset_ = 0; - } -} - -inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { - Flush(/* align */ true); - DCHECK_LE(byte_offset_, max_bytes_); - if (byte_offset_ + num_bytes > max_bytes_) return NULL; - uint8_t* ptr = buffer_ + byte_offset_; - byte_offset_ += num_bytes; - return ptr; -} - -template -inline bool BitWriter::PutAligned(T val, int num_bytes) { - uint8_t* ptr = GetNextBytePtr(num_bytes); - if (ptr == NULL) return false; - memcpy(ptr, &val, num_bytes); - return true; -} - -inline bool BitWriter::PutVlqInt(uint32_t v) { - bool result = true; - while ((v & 0xFFFFFF80) != 0L) { - result &= PutAligned((v & 0x7F) | 0x80, 1); - v >>= 7; - } - result &= PutAligned(v & 0x7F, 1); - return result; -} - -template -inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer, - int* bit_offset, int* byte_offset, uint64_t* buffered_values) { -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4800) -#endif - *v = static_cast( - BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - *bit_offset += num_bits; - if (*bit_offset >= 64) { - *byte_offset += 8; - *bit_offset -= 64; - - int bytes_remaining = max_bytes - *byte_offset; - if (LIKELY(bytes_remaining >= 8)) { - memcpy(buffered_values, buffer + *byte_offset, 8); - } else { - memcpy(buffered_values, buffer + *byte_offset, bytes_remaining); - } -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4800 4805) -#endif - // Read bits of v that crossed into new buffered_values_ - *v |= BitUtil::TrailingBits(*buffered_values, *bit_offset) - << (num_bits - *bit_offset); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - DCHECK_LE(*bit_offset, 64); - } -} - -template -inline bool BitReader::GetValue(int num_bits, T* v) { - return GetBatch(num_bits, v, 1) == 1; -} - -template -inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { - DCHECK(buffer_ != NULL); - // TODO: revisit this limit if necessary - DCHECK_LE(num_bits, 32); - DCHECK_LE(num_bits, static_cast(sizeof(T) * 8)); - - int bit_offset = bit_offset_; - int byte_offset = byte_offset_; - uint64_t buffered_values = buffered_values_; - int max_bytes = max_bytes_; - const uint8_t* buffer = buffer_; - - uint64_t needed_bits = num_bits * batch_size; - uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset; - if (remaining_bits < needed_bits) { - batch_size = static_cast(remaining_bits) / num_bits; - } - - int i = 0; - if (UNLIKELY(bit_offset != 0)) { - for (; i < batch_size && bit_offset != 0; ++i) { - GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, - &buffered_values); - } - } - - if (sizeof(T) == 4) { - int num_unpacked = unpack32(reinterpret_cast(buffer + byte_offset), - reinterpret_cast(v + i), batch_size - i, num_bits); - i += num_unpacked; - byte_offset += num_unpacked * num_bits / 8; - } else { - const int buffer_size = 1024; - uint32_t unpack_buffer[buffer_size]; - while (i < batch_size) { - int unpack_size = std::min(buffer_size, batch_size - i); - int num_unpacked = unpack32(reinterpret_cast(buffer + byte_offset), - unpack_buffer, unpack_size, num_bits); - if (num_unpacked == 0) { break; } - for (int k = 0; k < num_unpacked; ++k) { -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4800) -#endif - v[i + k] = unpack_buffer[k]; -#ifdef _MSC_VER -#pragma warning(pop) -#endif - } - i += num_unpacked; - byte_offset += num_unpacked * num_bits / 8; - } - } - - int bytes_remaining = max_bytes - byte_offset; - if (bytes_remaining >= 8) { - memcpy(&buffered_values, buffer + byte_offset, 8); - } else { - memcpy(&buffered_values, buffer + byte_offset, bytes_remaining); - } - - for (; i < batch_size; ++i) { - GetValue_( - num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, &buffered_values); - } - - bit_offset_ = bit_offset; - byte_offset_ = byte_offset; - buffered_values_ = buffered_values; - - return batch_size; -} - -template -inline bool BitReader::GetAligned(int num_bytes, T* v) { - DCHECK_LE(num_bytes, static_cast(sizeof(T))); - int bytes_read = static_cast(BitUtil::Ceil(bit_offset_, 8)); - if (UNLIKELY(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false; - - // Advance byte_offset to next unread byte and read num_bytes - byte_offset_ += bytes_read; - memcpy(v, buffer_ + byte_offset_, num_bytes); - byte_offset_ += num_bytes; - - // Reset buffered_values_ - bit_offset_ = 0; - int bytes_remaining = max_bytes_ - byte_offset_; - if (LIKELY(bytes_remaining >= 8)) { - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); - } else { - memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); - } - return true; -} - -inline bool BitReader::GetVlqInt(int32_t* v) { - *v = 0; - int shift = 0; - int num_bytes = 0; - uint8_t byte = 0; - do { - if (!GetAligned(1, &byte)) return false; - *v |= (byte & 0x7F) << shift; - shift += 7; - DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); - } while ((byte & 0x80) != 0); - return true; -} - -inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u = (v << 1) ^ (v >> 31); - return PutVlqInt(u); -} - -inline bool BitReader::GetZigZagVlqInt(int32_t* v) { - int32_t u_signed; - if (!GetVlqInt(&u_signed)) return false; - uint32_t u = static_cast(u_signed); - *reinterpret_cast(v) = (u >> 1) ^ -(static_cast(u & 1)); - return true; -} - -} // namespace parquet - -#endif // PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc deleted file mode 100644 index bc3e182c..00000000 --- a/src/parquet/util/bit-util-test.cc +++ /dev/null @@ -1,190 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29 - -#include -#include -#include -#include - -#include - -#include - -#include "parquet/util/bit-stream-utils.inline.h" -#include "parquet/util/bit-util.h" -#include "parquet/util/cpu-info.h" - -namespace parquet { - -static void ensure_cpu_info_initialized() { - if (!CpuInfo::initialized()) { CpuInfo::Init(); } -} - -TEST(BitUtil, Ceil) { - EXPECT_EQ(BitUtil::Ceil(0, 1), 0); - EXPECT_EQ(BitUtil::Ceil(1, 1), 1); - EXPECT_EQ(BitUtil::Ceil(1, 2), 1); - EXPECT_EQ(BitUtil::Ceil(1, 8), 1); - EXPECT_EQ(BitUtil::Ceil(7, 8), 1); - EXPECT_EQ(BitUtil::Ceil(8, 8), 1); - EXPECT_EQ(BitUtil::Ceil(9, 8), 2); - EXPECT_EQ(BitUtil::Ceil(9, 9), 1); - EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000); - EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1); - EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10); -} - -TEST(BitUtil, RoundUp) { - EXPECT_EQ(BitUtil::RoundUp(0, 1), 0); - EXPECT_EQ(BitUtil::RoundUp(1, 1), 1); - EXPECT_EQ(BitUtil::RoundUp(1, 2), 2); - EXPECT_EQ(BitUtil::RoundUp(6, 2), 6); - EXPECT_EQ(BitUtil::RoundUp(7, 3), 9); - EXPECT_EQ(BitUtil::RoundUp(9, 9), 9); - EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010); - EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000); - EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000); -} - -TEST(BitUtil, RoundDown) { - EXPECT_EQ(BitUtil::RoundDown(0, 1), 0); - EXPECT_EQ(BitUtil::RoundDown(1, 1), 1); - EXPECT_EQ(BitUtil::RoundDown(1, 2), 0); - EXPECT_EQ(BitUtil::RoundDown(6, 2), 6); - EXPECT_EQ(BitUtil::RoundDown(7, 3), 6); - EXPECT_EQ(BitUtil::RoundDown(9, 9), 9); - EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000); - EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0); - EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000); -} - -TEST(BitUtil, Popcount) { - ensure_cpu_info_initialized(); - - EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4); - EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4); - EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6); - EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6); - EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8); - EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8); - EXPECT_EQ(BitUtil::Popcount(0), 0); - EXPECT_EQ(BitUtil::PopcountNoHw(0), 0); -} - -TEST(BitUtil, TrailingBits) { - EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0); - EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1); - EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64), - BOOST_BINARY(1 1 1 1 1 1 1 1)); - EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100), - BOOST_BINARY(1 1 1 1 1 1 1 1)); - EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0); - EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0); - EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0); - EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0); - EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63); -} - -TEST(BitUtil, ByteSwap) { - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); - - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); - - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ( - BitUtil::ByteSwap(static_cast(0x1122334455667788)), 0x8877665544332211); - - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ( - BitUtil::ByteSwap(static_cast(0x1122334455667788)), 0x8877665544332211); - - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); - - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); - EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); -} - -TEST(BitUtil, Log2) { - EXPECT_EQ(BitUtil::Log2(1), 0); - EXPECT_EQ(BitUtil::Log2(2), 1); - EXPECT_EQ(BitUtil::Log2(3), 2); - EXPECT_EQ(BitUtil::Log2(4), 2); - EXPECT_EQ(BitUtil::Log2(5), 3); - EXPECT_EQ(BitUtil::Log2(INT_MAX), 31); - EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32); - EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64); -} - -TEST(BitUtil, RoundUpToPowerOf2) { - EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 8), 8); - EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8); - EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16); -} - -TEST(BitUtil, RoundDownToPowerOf2) { - EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0); - EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8); - EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8); -} - -TEST(BitUtil, RoundUpDown) { - EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1); - EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1); - EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2); - EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0); - EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1); - EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1); - - EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1); - EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1); - EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2); - EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0); - EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1); - EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1); - - EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1); - EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1); - EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2); - EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0); - EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1); - EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1); -} - -void TestZigZag(int32_t v) { - uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN]; - BitWriter writer(buffer, sizeof(buffer)); - BitReader reader(buffer, sizeof(buffer)); - writer.PutZigZagVlqInt(v); - int32_t result; - EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); - EXPECT_EQ(v, result); -} - -TEST(BitStreamUtil, ZigZag) { - TestZigZag(0); - TestZigZag(1); - TestZigZag(-1); - TestZigZag(std::numeric_limits::max()); - TestZigZag(-std::numeric_limits::max()); -} - -} // namespace parquet diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h deleted file mode 100644 index e315b5f4..00000000 --- a/src/parquet/util/bit-util.h +++ /dev/null @@ -1,340 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29 - -#ifndef PARQUET_UTIL_BIT_UTIL_H -#define PARQUET_UTIL_BIT_UTIL_H - -#if defined(__APPLE__) -#include -#elif defined(_WIN32) -#define __LITTLE_ENDIAN 1 -#else -#include -#endif - -#if defined(_MSC_VER) -#define PARQUET_BYTE_SWAP64 _byteswap_uint64 -#define PARQUET_BYTE_SWAP32 _byteswap_ulong -#else -#define PARQUET_BYTE_SWAP64 __builtin_bswap64 -#define PARQUET_BYTE_SWAP32 __builtin_bswap32 -#endif - -#include - -#include "parquet/util/compiler-util.h" - -#ifdef PARQUET_USE_SSE -#include "parquet/util/cpu-info.h" -#include "parquet/util/sse-util.h" -#endif - -namespace parquet { - -#define INIT_BITSET(valid_bits_vector, valid_bits_index) \ - int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \ - int bit_offset_##valid_bits_vector = (valid_bits_index) % 8; \ - uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; - -#define READ_NEXT_BITSET(valid_bits_vector) \ - bit_offset_##valid_bits_vector++; \ - if (bit_offset_##valid_bits_vector == 8) { \ - bit_offset_##valid_bits_vector = 0; \ - byte_offset_##valid_bits_vector++; \ - bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \ - } - -// TODO(wesm): The source from Impala was depending on boost::make_unsigned -// -// We add a partial stub implementation here - -template -struct make_unsigned {}; - -template <> -struct make_unsigned { - typedef uint8_t type; -}; - -template <> -struct make_unsigned { - typedef uint16_t type; -}; - -template <> -struct make_unsigned { - typedef uint32_t type; -}; - -template <> -struct make_unsigned { - typedef uint64_t type; -}; - -/// Utility class to do standard bit tricks -class BitUtil { - public: - /// Returns the ceil of value/divisor - static inline int64_t Ceil(int64_t value, int64_t divisor) { - return value / divisor + (value % divisor != 0); - } - - /// Returns 'value' rounded up to the nearest multiple of 'factor' - static inline int64_t RoundUp(int64_t value, int64_t factor) { - return (value + (factor - 1)) / factor * factor; - } - - /// Returns 'value' rounded down to the nearest multiple of 'factor' - static inline int64_t RoundDown(int64_t value, int64_t factor) { - return (value / factor) * factor; - } - - /// Returns the smallest power of two that contains v. Taken from - /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - /// TODO: Pick a better name, as it is not clear what happens when the input is - /// already a power of two. - static inline int64_t NextPowerOfTwo(int64_t v) { - --v; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - ++v; - return v; - } - - /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is - /// a power of two - static inline int RoundUpToPowerOf2(int value, int factor) { - // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); - return (value + (factor - 1)) & ~(factor - 1); - } - - static inline int RoundDownToPowerOf2(int value, int factor) { - // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); - return value & ~(factor - 1); - } - - /// Specialized round up and down functions for frequently used factors, - /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64). - /// Returns the rounded up number of bytes that fit the number of bits. - static inline uint32_t RoundUpNumBytes(uint32_t bits) { return (bits + 7) >> 3; } - - /// Returns the rounded down number of bytes that fit the number of bits. - static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; } - - /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32. - static inline uint32_t RoundUpNumi32(uint32_t bits) { return (bits + 31) >> 5; } - - /// Returns the rounded up 32 multiple. - static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; } - - /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64. - static inline uint32_t RoundUpNumi64(uint32_t bits) { return (bits + 63) >> 6; } - - /// Returns the rounded down to 64 multiple. - static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; } - - /// Non hw accelerated pop count. - /// TODO: we don't use this in any perf sensitive code paths currently. There - /// might be a much faster way to implement this. - static inline int PopcountNoHw(uint64_t x) { - int count = 0; - for (; x != 0; ++count) - x &= x - 1; - return count; - } - - /// Returns the number of set bits in x - static inline int Popcount(uint64_t x) { -#ifdef PARQUET_USE_SSE - if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) { - return POPCNT_popcnt_u64(x); - } else { - return PopcountNoHw(x); - } -#else - return PopcountNoHw(x); -#endif - } - - // Compute correct population count for various-width signed integers - template - static inline int PopcountSigned(T v) { - // Converting to same-width unsigned then extending preserves the bit pattern. - return BitUtil::Popcount(static_cast::type>(v)); - } - - /// Returns the 'num_bits' least-significant bits of 'v'. - static inline uint64_t TrailingBits(uint64_t v, int num_bits) { - if (UNLIKELY(num_bits == 0)) return 0; - if (UNLIKELY(num_bits >= 64)) return v; - int n = 64 - num_bits; - return (v << n) >> n; - } - - /// Returns ceil(log2(x)). - /// TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up - /// in a hot path. - static inline int Log2(uint64_t x) { - // DCHECK_GT(x, 0); - if (x == 1) return 0; - // Compute result = ceil(log2(x)) - // = floor(log2(x - 1)) + 1, for x > 1 - // by finding the position of the most significant bit (1-indexed) of x - 1 - // (floor(log2(n)) = MSB(n) (0-indexed)) - --x; - int result = 1; - while (x >>= 1) - ++result; - return result; - } - - /// Swaps the byte order (i.e. endianess) - static inline int64_t ByteSwap(int64_t value) { return PARQUET_BYTE_SWAP64(value); } - static inline uint64_t ByteSwap(uint64_t value) { - return static_cast(PARQUET_BYTE_SWAP64(value)); - } - static inline int32_t ByteSwap(int32_t value) { return PARQUET_BYTE_SWAP32(value); } - static inline uint32_t ByteSwap(uint32_t value) { - return static_cast(PARQUET_BYTE_SWAP32(value)); - } - static inline int16_t ByteSwap(int16_t value) { - return (((value >> 8) & 0xff) | ((value & 0xff) << 8)); - } - static inline uint16_t ByteSwap(uint16_t value) { - return static_cast(ByteSwap(static_cast(value))); - } - - /// Write the swapped bytes into dst. Src and st cannot overlap. - static inline void ByteSwap(void* dst, const void* src, int len) { - switch (len) { - case 1: - *reinterpret_cast(dst) = *reinterpret_cast(src); - return; - case 2: - *reinterpret_cast(dst) = - ByteSwap(*reinterpret_cast(src)); - return; - case 4: - *reinterpret_cast(dst) = - ByteSwap(*reinterpret_cast(src)); - return; - case 8: - *reinterpret_cast(dst) = - ByteSwap(*reinterpret_cast(src)); - return; - default: - break; - } - - uint8_t* d = reinterpret_cast(dst); - const uint8_t* s = reinterpret_cast(src); - for (int i = 0; i < len; ++i) { - d[i] = s[len - i - 1]; - } - } - -/// Converts to big endian format (if not already in big endian) from the -/// machine's native endian format. -#if __BYTE_ORDER == __LITTLE_ENDIAN - static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } - static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); } - static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); } - static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); } - static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); } - static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); } -#else - static inline int64_t ToBigEndian(int64_t val) { return val; } - static inline uint64_t ToBigEndian(uint64_t val) { return val; } - static inline int32_t ToBigEndian(int32_t val) { return val; } - static inline uint32_t ToBigEndian(uint32_t val) { return val; } - static inline int16_t ToBigEndian(int16_t val) { return val; } - static inline uint16_t ToBigEndian(uint16_t val) { return val; } -#endif - -/// Converts from big endian format to the machine's native endian format. -#if __BYTE_ORDER == __LITTLE_ENDIAN - static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } - static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); } - static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); } - static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); } - static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); } - static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); } -#else - static inline int64_t FromBigEndian(int64_t val) { return val; } - static inline uint64_t FromBigEndian(uint64_t val) { return val; } - static inline int32_t FromBigEndian(int32_t val) { return val; } - static inline uint32_t FromBigEndian(uint32_t val) { return val; } - static inline int16_t FromBigEndian(int16_t val) { return val; } - static inline uint16_t FromBigEndian(uint16_t val) { return val; } -#endif - - // Logical right shift for signed integer types - // This is needed because the C >> operator does arithmetic right shift - // Negative shift amounts lead to undefined behavior - template - static T ShiftRightLogical(T v, int shift) { - // Conversion to unsigned ensures most significant bits always filled with 0's - return static_cast::type>(v) >> shift; - } - - // Get an specific bit of a numeric type - template - static inline int8_t GetBit(T v, int bitpos) { - T masked = v & (static_cast(0x1) << bitpos); - return static_cast(ShiftRightLogical(masked, bitpos)); - } - - // Set a specific bit to 1 - // Behavior when bitpos is negative is undefined - template - static T SetBit(T v, int bitpos) { - return v | (static_cast(0x1) << bitpos); - } - - static inline bool GetArrayBit(const uint8_t* bits, int i) { - return (bits[i / 8] & (1 << (i % 8))) != 0; - } - - static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) { - bits[i / 8] |= (1 << (i % 8)) * is_set; - } - - // Set a specific bit to 0 - // Behavior when bitpos is negative is undefined - template - static T UnsetBit(T v, int bitpos) { - return v & ~(static_cast(0x1) << bitpos); - } - - // Returns the minimum number of bits needed to represent the value of 'x' - static inline int NumRequiredBits(uint64_t x) { - for (int i = 63; i >= 0; --i) { - if (x & (UINT64_C(1) << i)) return i + 1; - } - return 0; - } -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_BIT_UTIL_H diff --git a/src/parquet/util/bpacking.h b/src/parquet/util/bpacking.h deleted file mode 100644 index aa6fdab6..00000000 --- a/src/parquet/util/bpacking.h +++ /dev/null @@ -1,3342 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file was modified from its original version for inclusion in parquet-cpp. -// Original source: -// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp -// The original copyright notice follows. - -/** -* -* This code is released under the -* Apache License Version 2.0 http://www.apache.org/licenses/. -* (c) Daniel Lemire 2013 -*/ - -#ifndef PARQUET_UTIL_BPACKING_H -#define PARQUET_UTIL_BPACKING_H - -#include - -namespace parquet { - -inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) & 1; - out++; - *out = ((*in) >> 1) & 1; - out++; - *out = ((*in) >> 2) & 1; - out++; - *out = ((*in) >> 3) & 1; - out++; - *out = ((*in) >> 4) & 1; - out++; - *out = ((*in) >> 5) & 1; - out++; - *out = ((*in) >> 6) & 1; - out++; - *out = ((*in) >> 7) & 1; - out++; - *out = ((*in) >> 8) & 1; - out++; - *out = ((*in) >> 9) & 1; - out++; - *out = ((*in) >> 10) & 1; - out++; - *out = ((*in) >> 11) & 1; - out++; - *out = ((*in) >> 12) & 1; - out++; - *out = ((*in) >> 13) & 1; - out++; - *out = ((*in) >> 14) & 1; - out++; - *out = ((*in) >> 15) & 1; - out++; - *out = ((*in) >> 16) & 1; - out++; - *out = ((*in) >> 17) & 1; - out++; - *out = ((*in) >> 18) & 1; - out++; - *out = ((*in) >> 19) & 1; - out++; - *out = ((*in) >> 20) & 1; - out++; - *out = ((*in) >> 21) & 1; - out++; - *out = ((*in) >> 22) & 1; - out++; - *out = ((*in) >> 23) & 1; - out++; - *out = ((*in) >> 24) & 1; - out++; - *out = ((*in) >> 25) & 1; - out++; - *out = ((*in) >> 26) & 1; - out++; - *out = ((*in) >> 27) & 1; - out++; - *out = ((*in) >> 28) & 1; - out++; - *out = ((*in) >> 29) & 1; - out++; - *out = ((*in) >> 30) & 1; - out++; - *out = ((*in) >> 31); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 2); - out++; - *out = ((*in) >> 2) % (1U << 2); - out++; - *out = ((*in) >> 4) % (1U << 2); - out++; - *out = ((*in) >> 6) % (1U << 2); - out++; - *out = ((*in) >> 8) % (1U << 2); - out++; - *out = ((*in) >> 10) % (1U << 2); - out++; - *out = ((*in) >> 12) % (1U << 2); - out++; - *out = ((*in) >> 14) % (1U << 2); - out++; - *out = ((*in) >> 16) % (1U << 2); - out++; - *out = ((*in) >> 18) % (1U << 2); - out++; - *out = ((*in) >> 20) % (1U << 2); - out++; - *out = ((*in) >> 22) % (1U << 2); - out++; - *out = ((*in) >> 24) % (1U << 2); - out++; - *out = ((*in) >> 26) % (1U << 2); - out++; - *out = ((*in) >> 28) % (1U << 2); - out++; - *out = ((*in) >> 30); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 2); - out++; - *out = ((*in) >> 2) % (1U << 2); - out++; - *out = ((*in) >> 4) % (1U << 2); - out++; - *out = ((*in) >> 6) % (1U << 2); - out++; - *out = ((*in) >> 8) % (1U << 2); - out++; - *out = ((*in) >> 10) % (1U << 2); - out++; - *out = ((*in) >> 12) % (1U << 2); - out++; - *out = ((*in) >> 14) % (1U << 2); - out++; - *out = ((*in) >> 16) % (1U << 2); - out++; - *out = ((*in) >> 18) % (1U << 2); - out++; - *out = ((*in) >> 20) % (1U << 2); - out++; - *out = ((*in) >> 22) % (1U << 2); - out++; - *out = ((*in) >> 24) % (1U << 2); - out++; - *out = ((*in) >> 26) % (1U << 2); - out++; - *out = ((*in) >> 28) % (1U << 2); - out++; - *out = ((*in) >> 30); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 3); - out++; - *out = ((*in) >> 3) % (1U << 3); - out++; - *out = ((*in) >> 6) % (1U << 3); - out++; - *out = ((*in) >> 9) % (1U << 3); - out++; - *out = ((*in) >> 12) % (1U << 3); - out++; - *out = ((*in) >> 15) % (1U << 3); - out++; - *out = ((*in) >> 18) % (1U << 3); - out++; - *out = ((*in) >> 21) % (1U << 3); - out++; - *out = ((*in) >> 24) % (1U << 3); - out++; - *out = ((*in) >> 27) % (1U << 3); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 1)) << (3 - 1); - out++; - *out = ((*in) >> 1) % (1U << 3); - out++; - *out = ((*in) >> 4) % (1U << 3); - out++; - *out = ((*in) >> 7) % (1U << 3); - out++; - *out = ((*in) >> 10) % (1U << 3); - out++; - *out = ((*in) >> 13) % (1U << 3); - out++; - *out = ((*in) >> 16) % (1U << 3); - out++; - *out = ((*in) >> 19) % (1U << 3); - out++; - *out = ((*in) >> 22) % (1U << 3); - out++; - *out = ((*in) >> 25) % (1U << 3); - out++; - *out = ((*in) >> 28) % (1U << 3); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 2)) << (3 - 2); - out++; - *out = ((*in) >> 2) % (1U << 3); - out++; - *out = ((*in) >> 5) % (1U << 3); - out++; - *out = ((*in) >> 8) % (1U << 3); - out++; - *out = ((*in) >> 11) % (1U << 3); - out++; - *out = ((*in) >> 14) % (1U << 3); - out++; - *out = ((*in) >> 17) % (1U << 3); - out++; - *out = ((*in) >> 20) % (1U << 3); - out++; - *out = ((*in) >> 23) % (1U << 3); - out++; - *out = ((*in) >> 26) % (1U << 3); - out++; - *out = ((*in) >> 29); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 5); - out++; - *out = ((*in) >> 5) % (1U << 5); - out++; - *out = ((*in) >> 10) % (1U << 5); - out++; - *out = ((*in) >> 15) % (1U << 5); - out++; - *out = ((*in) >> 20) % (1U << 5); - out++; - *out = ((*in) >> 25) % (1U << 5); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 3)) << (5 - 3); - out++; - *out = ((*in) >> 3) % (1U << 5); - out++; - *out = ((*in) >> 8) % (1U << 5); - out++; - *out = ((*in) >> 13) % (1U << 5); - out++; - *out = ((*in) >> 18) % (1U << 5); - out++; - *out = ((*in) >> 23) % (1U << 5); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 1)) << (5 - 1); - out++; - *out = ((*in) >> 1) % (1U << 5); - out++; - *out = ((*in) >> 6) % (1U << 5); - out++; - *out = ((*in) >> 11) % (1U << 5); - out++; - *out = ((*in) >> 16) % (1U << 5); - out++; - *out = ((*in) >> 21) % (1U << 5); - out++; - *out = ((*in) >> 26) % (1U << 5); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 4)) << (5 - 4); - out++; - *out = ((*in) >> 4) % (1U << 5); - out++; - *out = ((*in) >> 9) % (1U << 5); - out++; - *out = ((*in) >> 14) % (1U << 5); - out++; - *out = ((*in) >> 19) % (1U << 5); - out++; - *out = ((*in) >> 24) % (1U << 5); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 2)) << (5 - 2); - out++; - *out = ((*in) >> 2) % (1U << 5); - out++; - *out = ((*in) >> 7) % (1U << 5); - out++; - *out = ((*in) >> 12) % (1U << 5); - out++; - *out = ((*in) >> 17) % (1U << 5); - out++; - *out = ((*in) >> 22) % (1U << 5); - out++; - *out = ((*in) >> 27); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 6); - out++; - *out = ((*in) >> 6) % (1U << 6); - out++; - *out = ((*in) >> 12) % (1U << 6); - out++; - *out = ((*in) >> 18) % (1U << 6); - out++; - *out = ((*in) >> 24) % (1U << 6); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 4)) << (6 - 4); - out++; - *out = ((*in) >> 4) % (1U << 6); - out++; - *out = ((*in) >> 10) % (1U << 6); - out++; - *out = ((*in) >> 16) % (1U << 6); - out++; - *out = ((*in) >> 22) % (1U << 6); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 2)) << (6 - 2); - out++; - *out = ((*in) >> 2) % (1U << 6); - out++; - *out = ((*in) >> 8) % (1U << 6); - out++; - *out = ((*in) >> 14) % (1U << 6); - out++; - *out = ((*in) >> 20) % (1U << 6); - out++; - *out = ((*in) >> 26); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 6); - out++; - *out = ((*in) >> 6) % (1U << 6); - out++; - *out = ((*in) >> 12) % (1U << 6); - out++; - *out = ((*in) >> 18) % (1U << 6); - out++; - *out = ((*in) >> 24) % (1U << 6); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 4)) << (6 - 4); - out++; - *out = ((*in) >> 4) % (1U << 6); - out++; - *out = ((*in) >> 10) % (1U << 6); - out++; - *out = ((*in) >> 16) % (1U << 6); - out++; - *out = ((*in) >> 22) % (1U << 6); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 2)) << (6 - 2); - out++; - *out = ((*in) >> 2) % (1U << 6); - out++; - *out = ((*in) >> 8) % (1U << 6); - out++; - *out = ((*in) >> 14) % (1U << 6); - out++; - *out = ((*in) >> 20) % (1U << 6); - out++; - *out = ((*in) >> 26); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 7); - out++; - *out = ((*in) >> 7) % (1U << 7); - out++; - *out = ((*in) >> 14) % (1U << 7); - out++; - *out = ((*in) >> 21) % (1U << 7); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 3)) << (7 - 3); - out++; - *out = ((*in) >> 3) % (1U << 7); - out++; - *out = ((*in) >> 10) % (1U << 7); - out++; - *out = ((*in) >> 17) % (1U << 7); - out++; - *out = ((*in) >> 24) % (1U << 7); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 6)) << (7 - 6); - out++; - *out = ((*in) >> 6) % (1U << 7); - out++; - *out = ((*in) >> 13) % (1U << 7); - out++; - *out = ((*in) >> 20) % (1U << 7); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 2)) << (7 - 2); - out++; - *out = ((*in) >> 2) % (1U << 7); - out++; - *out = ((*in) >> 9) % (1U << 7); - out++; - *out = ((*in) >> 16) % (1U << 7); - out++; - *out = ((*in) >> 23) % (1U << 7); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 5)) << (7 - 5); - out++; - *out = ((*in) >> 5) % (1U << 7); - out++; - *out = ((*in) >> 12) % (1U << 7); - out++; - *out = ((*in) >> 19) % (1U << 7); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 1)) << (7 - 1); - out++; - *out = ((*in) >> 1) % (1U << 7); - out++; - *out = ((*in) >> 8) % (1U << 7); - out++; - *out = ((*in) >> 15) % (1U << 7); - out++; - *out = ((*in) >> 22) % (1U << 7); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 4)) << (7 - 4); - out++; - *out = ((*in) >> 4) % (1U << 7); - out++; - *out = ((*in) >> 11) % (1U << 7); - out++; - *out = ((*in) >> 18) % (1U << 7); - out++; - *out = ((*in) >> 25); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 9); - out++; - *out = ((*in) >> 9) % (1U << 9); - out++; - *out = ((*in) >> 18) % (1U << 9); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 4)) << (9 - 4); - out++; - *out = ((*in) >> 4) % (1U << 9); - out++; - *out = ((*in) >> 13) % (1U << 9); - out++; - *out = ((*in) >> 22) % (1U << 9); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 8)) << (9 - 8); - out++; - *out = ((*in) >> 8) % (1U << 9); - out++; - *out = ((*in) >> 17) % (1U << 9); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 3)) << (9 - 3); - out++; - *out = ((*in) >> 3) % (1U << 9); - out++; - *out = ((*in) >> 12) % (1U << 9); - out++; - *out = ((*in) >> 21) % (1U << 9); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 7)) << (9 - 7); - out++; - *out = ((*in) >> 7) % (1U << 9); - out++; - *out = ((*in) >> 16) % (1U << 9); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 2)) << (9 - 2); - out++; - *out = ((*in) >> 2) % (1U << 9); - out++; - *out = ((*in) >> 11) % (1U << 9); - out++; - *out = ((*in) >> 20) % (1U << 9); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 6)) << (9 - 6); - out++; - *out = ((*in) >> 6) % (1U << 9); - out++; - *out = ((*in) >> 15) % (1U << 9); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 1)) << (9 - 1); - out++; - *out = ((*in) >> 1) % (1U << 9); - out++; - *out = ((*in) >> 10) % (1U << 9); - out++; - *out = ((*in) >> 19) % (1U << 9); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 5)) << (9 - 5); - out++; - *out = ((*in) >> 5) % (1U << 9); - out++; - *out = ((*in) >> 14) % (1U << 9); - out++; - *out = ((*in) >> 23); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 10); - out++; - *out = ((*in) >> 10) % (1U << 10); - out++; - *out = ((*in) >> 20) % (1U << 10); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 8)) << (10 - 8); - out++; - *out = ((*in) >> 8) % (1U << 10); - out++; - *out = ((*in) >> 18) % (1U << 10); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 6)) << (10 - 6); - out++; - *out = ((*in) >> 6) % (1U << 10); - out++; - *out = ((*in) >> 16) % (1U << 10); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 4)) << (10 - 4); - out++; - *out = ((*in) >> 4) % (1U << 10); - out++; - *out = ((*in) >> 14) % (1U << 10); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 2)) << (10 - 2); - out++; - *out = ((*in) >> 2) % (1U << 10); - out++; - *out = ((*in) >> 12) % (1U << 10); - out++; - *out = ((*in) >> 22); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 10); - out++; - *out = ((*in) >> 10) % (1U << 10); - out++; - *out = ((*in) >> 20) % (1U << 10); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 8)) << (10 - 8); - out++; - *out = ((*in) >> 8) % (1U << 10); - out++; - *out = ((*in) >> 18) % (1U << 10); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 6)) << (10 - 6); - out++; - *out = ((*in) >> 6) % (1U << 10); - out++; - *out = ((*in) >> 16) % (1U << 10); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 4)) << (10 - 4); - out++; - *out = ((*in) >> 4) % (1U << 10); - out++; - *out = ((*in) >> 14) % (1U << 10); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 2)) << (10 - 2); - out++; - *out = ((*in) >> 2) % (1U << 10); - out++; - *out = ((*in) >> 12) % (1U << 10); - out++; - *out = ((*in) >> 22); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 11); - out++; - *out = ((*in) >> 11) % (1U << 11); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 1)) << (11 - 1); - out++; - *out = ((*in) >> 1) % (1U << 11); - out++; - *out = ((*in) >> 12) % (1U << 11); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 2)) << (11 - 2); - out++; - *out = ((*in) >> 2) % (1U << 11); - out++; - *out = ((*in) >> 13) % (1U << 11); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 3)) << (11 - 3); - out++; - *out = ((*in) >> 3) % (1U << 11); - out++; - *out = ((*in) >> 14) % (1U << 11); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 4)) << (11 - 4); - out++; - *out = ((*in) >> 4) % (1U << 11); - out++; - *out = ((*in) >> 15) % (1U << 11); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 5)) << (11 - 5); - out++; - *out = ((*in) >> 5) % (1U << 11); - out++; - *out = ((*in) >> 16) % (1U << 11); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 6)) << (11 - 6); - out++; - *out = ((*in) >> 6) % (1U << 11); - out++; - *out = ((*in) >> 17) % (1U << 11); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 7)) << (11 - 7); - out++; - *out = ((*in) >> 7) % (1U << 11); - out++; - *out = ((*in) >> 18) % (1U << 11); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 8)) << (11 - 8); - out++; - *out = ((*in) >> 8) % (1U << 11); - out++; - *out = ((*in) >> 19) % (1U << 11); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 9)) << (11 - 9); - out++; - *out = ((*in) >> 9) % (1U << 11); - out++; - *out = ((*in) >> 20) % (1U << 11); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 10)) << (11 - 10); - out++; - *out = ((*in) >> 10) % (1U << 11); - out++; - *out = ((*in) >> 21); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 13); - out++; - *out = ((*in) >> 13) % (1U << 13); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 7)) << (13 - 7); - out++; - *out = ((*in) >> 7) % (1U << 13); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 1)) << (13 - 1); - out++; - *out = ((*in) >> 1) % (1U << 13); - out++; - *out = ((*in) >> 14) % (1U << 13); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 8)) << (13 - 8); - out++; - *out = ((*in) >> 8) % (1U << 13); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 2)) << (13 - 2); - out++; - *out = ((*in) >> 2) % (1U << 13); - out++; - *out = ((*in) >> 15) % (1U << 13); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 9)) << (13 - 9); - out++; - *out = ((*in) >> 9) % (1U << 13); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 3)) << (13 - 3); - out++; - *out = ((*in) >> 3) % (1U << 13); - out++; - *out = ((*in) >> 16) % (1U << 13); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 10)) << (13 - 10); - out++; - *out = ((*in) >> 10) % (1U << 13); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 4)) << (13 - 4); - out++; - *out = ((*in) >> 4) % (1U << 13); - out++; - *out = ((*in) >> 17) % (1U << 13); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 11)) << (13 - 11); - out++; - *out = ((*in) >> 11) % (1U << 13); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 5)) << (13 - 5); - out++; - *out = ((*in) >> 5) % (1U << 13); - out++; - *out = ((*in) >> 18) % (1U << 13); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 12)) << (13 - 12); - out++; - *out = ((*in) >> 12) % (1U << 13); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 6)) << (13 - 6); - out++; - *out = ((*in) >> 6) % (1U << 13); - out++; - *out = ((*in) >> 19); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 14); - out++; - *out = ((*in) >> 14) % (1U << 14); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 10)) << (14 - 10); - out++; - *out = ((*in) >> 10) % (1U << 14); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 6)) << (14 - 6); - out++; - *out = ((*in) >> 6) % (1U << 14); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 2)) << (14 - 2); - out++; - *out = ((*in) >> 2) % (1U << 14); - out++; - *out = ((*in) >> 16) % (1U << 14); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 12)) << (14 - 12); - out++; - *out = ((*in) >> 12) % (1U << 14); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 8)) << (14 - 8); - out++; - *out = ((*in) >> 8) % (1U << 14); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 4)) << (14 - 4); - out++; - *out = ((*in) >> 4) % (1U << 14); - out++; - *out = ((*in) >> 18); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 14); - out++; - *out = ((*in) >> 14) % (1U << 14); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 10)) << (14 - 10); - out++; - *out = ((*in) >> 10) % (1U << 14); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 6)) << (14 - 6); - out++; - *out = ((*in) >> 6) % (1U << 14); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 2)) << (14 - 2); - out++; - *out = ((*in) >> 2) % (1U << 14); - out++; - *out = ((*in) >> 16) % (1U << 14); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 12)) << (14 - 12); - out++; - *out = ((*in) >> 12) % (1U << 14); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 8)) << (14 - 8); - out++; - *out = ((*in) >> 8) % (1U << 14); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 4)) << (14 - 4); - out++; - *out = ((*in) >> 4) % (1U << 14); - out++; - *out = ((*in) >> 18); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 15); - out++; - *out = ((*in) >> 15) % (1U << 15); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 13)) << (15 - 13); - out++; - *out = ((*in) >> 13) % (1U << 15); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 11)) << (15 - 11); - out++; - *out = ((*in) >> 11) % (1U << 15); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 9)) << (15 - 9); - out++; - *out = ((*in) >> 9) % (1U << 15); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 7)) << (15 - 7); - out++; - *out = ((*in) >> 7) % (1U << 15); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 5)) << (15 - 5); - out++; - *out = ((*in) >> 5) % (1U << 15); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 3)) << (15 - 3); - out++; - *out = ((*in) >> 3) % (1U << 15); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 1)) << (15 - 1); - out++; - *out = ((*in) >> 1) % (1U << 15); - out++; - *out = ((*in) >> 16) % (1U << 15); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 14)) << (15 - 14); - out++; - *out = ((*in) >> 14) % (1U << 15); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 12)) << (15 - 12); - out++; - *out = ((*in) >> 12) % (1U << 15); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 10)) << (15 - 10); - out++; - *out = ((*in) >> 10) % (1U << 15); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 8)) << (15 - 8); - out++; - *out = ((*in) >> 8) % (1U << 15); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 6)) << (15 - 6); - out++; - *out = ((*in) >> 6) % (1U << 15); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 4)) << (15 - 4); - out++; - *out = ((*in) >> 4) % (1U << 15); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 2)) << (15 - 2); - out++; - *out = ((*in) >> 2) % (1U << 15); - out++; - *out = ((*in) >> 17); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 2)) << (17 - 2); - out++; - *out = ((*in) >> 2) % (1U << 17); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 4)) << (17 - 4); - out++; - *out = ((*in) >> 4) % (1U << 17); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 6)) << (17 - 6); - out++; - *out = ((*in) >> 6) % (1U << 17); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 8)) << (17 - 8); - out++; - *out = ((*in) >> 8) % (1U << 17); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 10)) << (17 - 10); - out++; - *out = ((*in) >> 10) % (1U << 17); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 12)) << (17 - 12); - out++; - *out = ((*in) >> 12) % (1U << 17); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 14)) << (17 - 14); - out++; - *out = ((*in) >> 14) % (1U << 17); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 16)) << (17 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 1)) << (17 - 1); - out++; - *out = ((*in) >> 1) % (1U << 17); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 3)) << (17 - 3); - out++; - *out = ((*in) >> 3) % (1U << 17); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 5)) << (17 - 5); - out++; - *out = ((*in) >> 5) % (1U << 17); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 7)) << (17 - 7); - out++; - *out = ((*in) >> 7) % (1U << 17); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 9)) << (17 - 9); - out++; - *out = ((*in) >> 9) % (1U << 17); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 11)) << (17 - 11); - out++; - *out = ((*in) >> 11) % (1U << 17); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 13)) << (17 - 13); - out++; - *out = ((*in) >> 13) % (1U << 17); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 15)) << (17 - 15); - out++; - *out = ((*in) >> 15); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 4)) << (18 - 4); - out++; - *out = ((*in) >> 4) % (1U << 18); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 8)) << (18 - 8); - out++; - *out = ((*in) >> 8) % (1U << 18); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 12)) << (18 - 12); - out++; - *out = ((*in) >> 12) % (1U << 18); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 16)) << (18 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 2)) << (18 - 2); - out++; - *out = ((*in) >> 2) % (1U << 18); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 6)) << (18 - 6); - out++; - *out = ((*in) >> 6) % (1U << 18); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 10)) << (18 - 10); - out++; - *out = ((*in) >> 10) % (1U << 18); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 14)) << (18 - 14); - out++; - *out = ((*in) >> 14); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 4)) << (18 - 4); - out++; - *out = ((*in) >> 4) % (1U << 18); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 8)) << (18 - 8); - out++; - *out = ((*in) >> 8) % (1U << 18); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 12)) << (18 - 12); - out++; - *out = ((*in) >> 12) % (1U << 18); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 16)) << (18 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 2)) << (18 - 2); - out++; - *out = ((*in) >> 2) % (1U << 18); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 6)) << (18 - 6); - out++; - *out = ((*in) >> 6) % (1U << 18); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 10)) << (18 - 10); - out++; - *out = ((*in) >> 10) % (1U << 18); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 14)) << (18 - 14); - out++; - *out = ((*in) >> 14); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 6)) << (19 - 6); - out++; - *out = ((*in) >> 6) % (1U << 19); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 12)) << (19 - 12); - out++; - *out = ((*in) >> 12) % (1U << 19); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 18)) << (19 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 5)) << (19 - 5); - out++; - *out = ((*in) >> 5) % (1U << 19); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 11)) << (19 - 11); - out++; - *out = ((*in) >> 11) % (1U << 19); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 17)) << (19 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 4)) << (19 - 4); - out++; - *out = ((*in) >> 4) % (1U << 19); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 10)) << (19 - 10); - out++; - *out = ((*in) >> 10) % (1U << 19); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 16)) << (19 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 3)) << (19 - 3); - out++; - *out = ((*in) >> 3) % (1U << 19); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 9)) << (19 - 9); - out++; - *out = ((*in) >> 9) % (1U << 19); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 15)) << (19 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 2)) << (19 - 2); - out++; - *out = ((*in) >> 2) % (1U << 19); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 8)) << (19 - 8); - out++; - *out = ((*in) >> 8) % (1U << 19); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 14)) << (19 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 1)) << (19 - 1); - out++; - *out = ((*in) >> 1) % (1U << 19); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 7)) << (19 - 7); - out++; - *out = ((*in) >> 7) % (1U << 19); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 13)) << (19 - 13); - out++; - *out = ((*in) >> 13); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 10)) << (21 - 10); - out++; - *out = ((*in) >> 10) % (1U << 21); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 20)) << (21 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 9)) << (21 - 9); - out++; - *out = ((*in) >> 9) % (1U << 21); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 19)) << (21 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 8)) << (21 - 8); - out++; - *out = ((*in) >> 8) % (1U << 21); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 18)) << (21 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 7)) << (21 - 7); - out++; - *out = ((*in) >> 7) % (1U << 21); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 17)) << (21 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 6)) << (21 - 6); - out++; - *out = ((*in) >> 6) % (1U << 21); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 16)) << (21 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 5)) << (21 - 5); - out++; - *out = ((*in) >> 5) % (1U << 21); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 15)) << (21 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 4)) << (21 - 4); - out++; - *out = ((*in) >> 4) % (1U << 21); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 14)) << (21 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 3)) << (21 - 3); - out++; - *out = ((*in) >> 3) % (1U << 21); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 13)) << (21 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 2)) << (21 - 2); - out++; - *out = ((*in) >> 2) % (1U << 21); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 12)) << (21 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 1)) << (21 - 1); - out++; - *out = ((*in) >> 1) % (1U << 21); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 11)) << (21 - 11); - out++; - *out = ((*in) >> 11); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 12)) << (22 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 2)) << (22 - 2); - out++; - *out = ((*in) >> 2) % (1U << 22); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 14)) << (22 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 4)) << (22 - 4); - out++; - *out = ((*in) >> 4) % (1U << 22); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 16)) << (22 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 6)) << (22 - 6); - out++; - *out = ((*in) >> 6) % (1U << 22); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 18)) << (22 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 8)) << (22 - 8); - out++; - *out = ((*in) >> 8) % (1U << 22); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 20)) << (22 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 10)) << (22 - 10); - out++; - *out = ((*in) >> 10); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 12)) << (22 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 2)) << (22 - 2); - out++; - *out = ((*in) >> 2) % (1U << 22); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 14)) << (22 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 4)) << (22 - 4); - out++; - *out = ((*in) >> 4) % (1U << 22); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 16)) << (22 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 6)) << (22 - 6); - out++; - *out = ((*in) >> 6) % (1U << 22); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 18)) << (22 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 8)) << (22 - 8); - out++; - *out = ((*in) >> 8) % (1U << 22); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 20)) << (22 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 10)) << (22 - 10); - out++; - *out = ((*in) >> 10); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 14)) << (23 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 5)) << (23 - 5); - out++; - *out = ((*in) >> 5) % (1U << 23); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 19)) << (23 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 10)) << (23 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 1)) << (23 - 1); - out++; - *out = ((*in) >> 1) % (1U << 23); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 15)) << (23 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 6)) << (23 - 6); - out++; - *out = ((*in) >> 6) % (1U << 23); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 20)) << (23 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 11)) << (23 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 2)) << (23 - 2); - out++; - *out = ((*in) >> 2) % (1U << 23); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 16)) << (23 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 7)) << (23 - 7); - out++; - *out = ((*in) >> 7) % (1U << 23); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 21)) << (23 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 12)) << (23 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 3)) << (23 - 3); - out++; - *out = ((*in) >> 3) % (1U << 23); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 17)) << (23 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 8)) << (23 - 8); - out++; - *out = ((*in) >> 8) % (1U << 23); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 22)) << (23 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 13)) << (23 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 4)) << (23 - 4); - out++; - *out = ((*in) >> 4) % (1U << 23); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 18)) << (23 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 9)) << (23 - 9); - out++; - *out = ((*in) >> 9); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 18)) << (25 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 11)) << (25 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 4)) << (25 - 4); - out++; - *out = ((*in) >> 4) % (1U << 25); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 22)) << (25 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 15)) << (25 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 8)) << (25 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 1)) << (25 - 1); - out++; - *out = ((*in) >> 1) % (1U << 25); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 19)) << (25 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 12)) << (25 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 5)) << (25 - 5); - out++; - *out = ((*in) >> 5) % (1U << 25); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 23)) << (25 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 16)) << (25 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 9)) << (25 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 2)) << (25 - 2); - out++; - *out = ((*in) >> 2) % (1U << 25); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 20)) << (25 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 13)) << (25 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 6)) << (25 - 6); - out++; - *out = ((*in) >> 6) % (1U << 25); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 24)) << (25 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 17)) << (25 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 10)) << (25 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 3)) << (25 - 3); - out++; - *out = ((*in) >> 3) % (1U << 25); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 21)) << (25 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 14)) << (25 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 7)) << (25 - 7); - out++; - *out = ((*in) >> 7); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 20)) << (26 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 14)) << (26 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 8)) << (26 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 2)) << (26 - 2); - out++; - *out = ((*in) >> 2) % (1U << 26); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 22)) << (26 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 16)) << (26 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 10)) << (26 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 4)) << (26 - 4); - out++; - *out = ((*in) >> 4) % (1U << 26); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 24)) << (26 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 18)) << (26 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 12)) << (26 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 6)) << (26 - 6); - out++; - *out = ((*in) >> 6); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 20)) << (26 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 14)) << (26 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 8)) << (26 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 2)) << (26 - 2); - out++; - *out = ((*in) >> 2) % (1U << 26); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 22)) << (26 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 16)) << (26 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 10)) << (26 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 4)) << (26 - 4); - out++; - *out = ((*in) >> 4) % (1U << 26); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 24)) << (26 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 18)) << (26 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 12)) << (26 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 6)) << (26 - 6); - out++; - *out = ((*in) >> 6); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 27); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 22)) << (27 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 17)) << (27 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 12)) << (27 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 7)) << (27 - 7); - out++; - *out = ((*in) >> 7); - ++in; - *out |= ((*in) % (1U << 2)) << (27 - 2); - out++; - *out = ((*in) >> 2) % (1U << 27); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 24)) << (27 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 19)) << (27 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 14)) << (27 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 9)) << (27 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 4)) << (27 - 4); - out++; - *out = ((*in) >> 4) % (1U << 27); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 26)) << (27 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 21)) << (27 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 16)) << (27 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 11)) << (27 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 6)) << (27 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 1)) << (27 - 1); - out++; - *out = ((*in) >> 1) % (1U << 27); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 23)) << (27 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 18)) << (27 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 13)) << (27 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 8)) << (27 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 3)) << (27 - 3); - out++; - *out = ((*in) >> 3) % (1U << 27); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 25)) << (27 - 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 20)) << (27 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 15)) << (27 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 10)) << (27 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 5)) << (27 - 5); - out++; - *out = ((*in) >> 5); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 29); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 26)) << (29 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 23)) << (29 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 20)) << (29 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 17)) << (29 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 14)) << (29 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 11)) << (29 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 8)) << (29 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 5)) << (29 - 5); - out++; - *out = ((*in) >> 5); - ++in; - *out |= ((*in) % (1U << 2)) << (29 - 2); - out++; - *out = ((*in) >> 2) % (1U << 29); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 28)) << (29 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 25)) << (29 - 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 22)) << (29 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 19)) << (29 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 16)) << (29 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 13)) << (29 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 10)) << (29 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 7)) << (29 - 7); - out++; - *out = ((*in) >> 7); - ++in; - *out |= ((*in) % (1U << 4)) << (29 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 1)) << (29 - 1); - out++; - *out = ((*in) >> 1) % (1U << 29); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 27)) << (29 - 27); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 24)) << (29 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 21)) << (29 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 18)) << (29 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 15)) << (29 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 12)) << (29 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 9)) << (29 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 6)) << (29 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 3)) << (29 - 3); - out++; - *out = ((*in) >> 3); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 30); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 28)) << (30 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 26)) << (30 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 24)) << (30 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 22)) << (30 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 20)) << (30 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 18)) << (30 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 16)) << (30 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 14)) << (30 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 12)) << (30 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 10)) << (30 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 8)) << (30 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 6)) << (30 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 4)) << (30 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 2)) << (30 - 2); - out++; - *out = ((*in) >> 2); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 30); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 28)) << (30 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 26)) << (30 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 24)) << (30 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 22)) << (30 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 20)) << (30 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 18)) << (30 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 16)) << (30 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 14)) << (30 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 12)) << (30 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 10)) << (30 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 8)) << (30 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 6)) << (30 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 4)) << (30 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 2)) << (30 - 2); - out++; - *out = ((*in) >> 2); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 31); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 30)) << (31 - 30); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 29)) << (31 - 29); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 28)) << (31 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 27)) << (31 - 27); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 26)) << (31 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 25)) << (31 - 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 24)) << (31 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 23)) << (31 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 22)) << (31 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 21)) << (31 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 20)) << (31 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 19)) << (31 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 18)) << (31 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 17)) << (31 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 16)) << (31 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 15)) << (31 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 14)) << (31 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 13)) << (31 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 12)) << (31 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 11)) << (31 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 10)) << (31 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 9)) << (31 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 8)) << (31 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 7)) << (31 - 7); - out++; - *out = ((*in) >> 7); - ++in; - *out |= ((*in) % (1U << 6)) << (31 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 5)) << (31 - 5); - out++; - *out = ((*in) >> 5); - ++in; - *out |= ((*in) % (1U << 4)) << (31 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 3)) << (31 - 3); - out++; - *out = ((*in) >> 3); - ++in; - *out |= ((*in) % (1U << 2)) << (31 - 2); - out++; - *out = ((*in) >> 2); - ++in; - *out |= ((*in) % (1U << 1)) << (31 - 1); - out++; - *out = ((*in) >> 1); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - - return in; -} - -inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) { - for (int k = 0; k < 32; ++k) { - out[k] = 0; - } - return in; -} - -inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - switch (num_bits) { - case 0: - for (int i = 0; i < num_loops; ++i) - in = nullunpacker32(in, out + i * 32); - break; - case 1: - for (int i = 0; i < num_loops; ++i) - in = unpack1_32(in, out + i * 32); - break; - case 2: - for (int i = 0; i < num_loops; ++i) - in = unpack2_32(in, out + i * 32); - break; - case 3: - for (int i = 0; i < num_loops; ++i) - in = unpack3_32(in, out + i * 32); - break; - case 4: - for (int i = 0; i < num_loops; ++i) - in = unpack4_32(in, out + i * 32); - break; - case 5: - for (int i = 0; i < num_loops; ++i) - in = unpack5_32(in, out + i * 32); - break; - case 6: - for (int i = 0; i < num_loops; ++i) - in = unpack6_32(in, out + i * 32); - break; - case 7: - for (int i = 0; i < num_loops; ++i) - in = unpack7_32(in, out + i * 32); - break; - case 8: - for (int i = 0; i < num_loops; ++i) - in = unpack8_32(in, out + i * 32); - break; - case 9: - for (int i = 0; i < num_loops; ++i) - in = unpack9_32(in, out + i * 32); - break; - case 10: - for (int i = 0; i < num_loops; ++i) - in = unpack10_32(in, out + i * 32); - break; - case 11: - for (int i = 0; i < num_loops; ++i) - in = unpack11_32(in, out + i * 32); - break; - case 12: - for (int i = 0; i < num_loops; ++i) - in = unpack12_32(in, out + i * 32); - break; - case 13: - for (int i = 0; i < num_loops; ++i) - in = unpack13_32(in, out + i * 32); - break; - case 14: - for (int i = 0; i < num_loops; ++i) - in = unpack14_32(in, out + i * 32); - break; - case 15: - for (int i = 0; i < num_loops; ++i) - in = unpack15_32(in, out + i * 32); - break; - case 16: - for (int i = 0; i < num_loops; ++i) - in = unpack16_32(in, out + i * 32); - break; - case 17: - for (int i = 0; i < num_loops; ++i) - in = unpack17_32(in, out + i * 32); - break; - case 18: - for (int i = 0; i < num_loops; ++i) - in = unpack18_32(in, out + i * 32); - break; - case 19: - for (int i = 0; i < num_loops; ++i) - in = unpack19_32(in, out + i * 32); - break; - case 20: - for (int i = 0; i < num_loops; ++i) - in = unpack20_32(in, out + i * 32); - break; - case 21: - for (int i = 0; i < num_loops; ++i) - in = unpack21_32(in, out + i * 32); - break; - case 22: - for (int i = 0; i < num_loops; ++i) - in = unpack22_32(in, out + i * 32); - break; - case 23: - for (int i = 0; i < num_loops; ++i) - in = unpack23_32(in, out + i * 32); - break; - case 24: - for (int i = 0; i < num_loops; ++i) - in = unpack24_32(in, out + i * 32); - break; - case 25: - for (int i = 0; i < num_loops; ++i) - in = unpack25_32(in, out + i * 32); - break; - case 26: - for (int i = 0; i < num_loops; ++i) - in = unpack26_32(in, out + i * 32); - break; - case 27: - for (int i = 0; i < num_loops; ++i) - in = unpack27_32(in, out + i * 32); - break; - case 28: - for (int i = 0; i < num_loops; ++i) - in = unpack28_32(in, out + i * 32); - break; - case 29: - for (int i = 0; i < num_loops; ++i) - in = unpack29_32(in, out + i * 32); - break; - case 30: - for (int i = 0; i < num_loops; ++i) - in = unpack30_32(in, out + i * 32); - break; - case 31: - for (int i = 0; i < num_loops; ++i) - in = unpack31_32(in, out + i * 32); - break; - case 32: - for (int i = 0; i < num_loops; ++i) - in = unpack32_32(in, out + i * 32); - break; - default: - throw std::runtime_error("Unsupported num_bits"); - } - - return batch_size; -} - -}; // namespace parquet - -#endif // PARQUET_UTIL_BPACKING_H diff --git a/src/parquet/util/compiler-util.h b/src/parquet/util/compiler-util.h deleted file mode 100644 index 9d0b2650..00000000 --- a/src/parquet/util/compiler-util.h +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_UTIL_COMPILER_UTIL_H -#define PARQUET_UTIL_COMPILER_UTIL_H - -// Compiler hint that this branch is likely or unlikely to -// be taken. Take from the "What all programmers should know -// about memory" paper. -// example: if (LIKELY(size > 0)) { ... } -// example: if (UNLIKELY(!status.ok())) { ... } -#ifdef LIKELY -#undef LIKELY -#endif - -#ifdef UNLIKELY -#undef UNLIKELY -#endif - -#ifdef _MSC_VER -#define LIKELY(expr) expr -#define UNLIKELY(expr) expr -#else -#define LIKELY(expr) __builtin_expect(!!(expr), 1) -#define UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#endif - -#define PREFETCH(addr) __builtin_prefetch(addr) - -// macros to disable padding -// these macros are portable across different compilers and platforms -//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355] -#if defined(_MSC_VER) -#define MANUALLY_ALIGNED_STRUCT(alignment) \ - __pragma(pack(1)); \ - struct __declspec(align(alignment)) -#define STRUCT_END(name, size) \ - __pragma(pack()); \ - static_assert(sizeof(name) == size, "compiler breaks packing rules") -#elif defined(__GNUC__) || defined(__clang__) -#define MANUALLY_ALIGNED_STRUCT(alignment) \ - _Pragma("pack(1)") struct __attribute__((aligned(alignment))) -#define STRUCT_END(name, size) \ - _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules") -#else -#error Unknown compiler, please define structure alignment macros -#endif - -#endif // PARQUET_UTIL_COMPILER_UTIL_H diff --git a/src/parquet/util/cpu-info.cc b/src/parquet/util/cpu-info.cc deleted file mode 100644 index c6c143ea..00000000 --- a/src/parquet/util/cpu-info.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29. Pared down to a minimal set of -// functions needed for parquet-cpp - -#include "parquet/util/cpu-info.h" - -#ifdef __APPLE__ -#include -#endif - -#include -#include - -#ifndef _MSC_VER -#include -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/util/logging.h" - -using boost::algorithm::contains; -using boost::algorithm::trim; -using std::max; -using std::string; - -namespace parquet { - -bool CpuInfo::initialized_ = false; -int64_t CpuInfo::hardware_flags_ = 0; -int64_t CpuInfo::original_hardware_flags_; -int64_t CpuInfo::cache_sizes_[L3_CACHE + 1]; -int64_t CpuInfo::cycles_per_ms_; -int CpuInfo::num_cores_ = 1; -string CpuInfo::model_name_ = "unknown"; // NOLINT -static std::mutex cpuinfo_mutex; - -static struct { - string name; - int64_t flag; -} flag_mappings[] = { - {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, {"sse4_2", CpuInfo::SSE4_2}, - {"popcnt", CpuInfo::POPCNT}, -}; -static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); - -// Helper function to parse for hardware flags. -// values contains a list of space-seperated flags. check to see if the flags we -// care about are present. -// Returns a bitmap of flags. -int64_t ParseCPUFlags(const string& values) { - int64_t flags = 0; - for (int i = 0; i < num_flags; ++i) { - if (contains(values, flag_mappings[i].name)) { flags |= flag_mappings[i].flag; } - } - return flags; -} - -void CpuInfo::Init() { - std::lock_guard cpuinfo_lock(cpuinfo_mutex); - - if (initialized()) { return; } - - string line; - string name; - string value; - - float max_mhz = 0; - int num_cores = 0; - - memset(&cache_sizes_, 0, sizeof(cache_sizes_)); - - // Read from /proc/cpuinfo - std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); - while (cpuinfo) { - getline(cpuinfo, line); - size_t colon = line.find(':'); - if (colon != string::npos) { - name = line.substr(0, colon - 1); - value = line.substr(colon + 1, string::npos); - trim(name); - trim(value); - if (name.compare("flags") == 0) { - hardware_flags_ |= ParseCPUFlags(value); - } else if (name.compare("cpu MHz") == 0) { - // Every core will report a different speed. We'll take the max, assuming - // that when impala is running, the core will not be in a lower power state. - // TODO: is there a more robust way to do this, such as - // Window's QueryPerformanceFrequency() - float mhz = static_cast(atof(value.c_str())); - max_mhz = max(mhz, max_mhz); - } else if (name.compare("processor") == 0) { - ++num_cores; - } else if (name.compare("model name") == 0) { - model_name_ = value; - } - } - } - if (cpuinfo.is_open()) cpuinfo.close(); - -#ifdef __APPLE__ - // On Mac OS X use sysctl() to get the cache sizes - size_t len = 0; - sysctlbyname("hw.cachesize", NULL, &len, NULL, 0); - uint64_t* data = static_cast(malloc(len)); - sysctlbyname("hw.cachesize", data, &len, NULL, 0); - DCHECK(len / sizeof(uint64_t) >= 3); - for (size_t i = 0; i < 3; ++i) { - cache_sizes_[i] = data[i]; - } -#else -#ifndef _SC_LEVEL1_DCACHE_SIZE - // Provide reasonable default values if no info - cache_sizes_[0] = 32 * 1024; // Level 1: 32k - cache_sizes_[1] = 256 * 1024; // Level 2: 256k - cache_sizes_[2] = 3072 * 1024; // Level 3: 3M -#else - // Call sysconf to query for the cache sizes - cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); - cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); - cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); -#endif -#endif - - if (max_mhz != 0) { - cycles_per_ms_ = static_cast(max_mhz) * 1000; - } else { - cycles_per_ms_ = 1000000; - } - original_hardware_flags_ = hardware_flags_; - - if (num_cores > 0) { - num_cores_ = num_cores; - } else { - num_cores_ = 1; - } - - initialized_ = true; -} - -void CpuInfo::VerifyCpuRequirements() { - if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) { - throw ParquetException("CPU does not support the Supplemental SSE3 instruction set"); - } -} - -void CpuInfo::EnableFeature(int64_t flag, bool enable) { - DCHECK(initialized_); - if (!enable) { - hardware_flags_ &= ~flag; - } else { - // Can't turn something on that can't be supported - DCHECK((original_hardware_flags_ & flag) != 0); - hardware_flags_ |= flag; - } -} - -int64_t CpuInfo::hardware_flags() { - DCHECK(initialized_); - return hardware_flags_; -} - -int64_t CpuInfo::CacheSize(CacheLevel level) { - DCHECK(initialized_); - return cache_sizes_[level]; -} - -int64_t CpuInfo::cycles_per_ms() { - DCHECK(initialized_); - return cycles_per_ms_; -} - -int CpuInfo::num_cores() { - DCHECK(initialized_); - return num_cores_; -} - -std::string CpuInfo::model_name() { - DCHECK(initialized_); - return model_name_; -} - -} // namespace parquet diff --git a/src/parquet/util/cpu-info.h b/src/parquet/util/cpu-info.h deleted file mode 100644 index 4d7cd273..00000000 --- a/src/parquet/util/cpu-info.h +++ /dev/null @@ -1,92 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29. Pared down to a minimal set of -// functions needed for parquet-cpp - -#ifndef PARQUET_UTIL_CPU_INFO_H -#define PARQUET_UTIL_CPU_INFO_H - -#include -#include - -#include "parquet/util/visibility.h" - -namespace parquet { - -/// CpuInfo is an interface to query for cpu information at runtime. The caller can -/// ask for the sizes of the caches and what hardware features are supported. -/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and -/// /sys/devices) -class PARQUET_EXPORT CpuInfo { - public: - static const int64_t SSSE3 = (1 << 1); - static const int64_t SSE4_1 = (1 << 2); - static const int64_t SSE4_2 = (1 << 3); - static const int64_t POPCNT = (1 << 4); - - /// Cache enums for L1 (data), L2 and L3 - enum CacheLevel { - L1_CACHE = 0, - L2_CACHE = 1, - L3_CACHE = 2, - }; - - /// Initialize CpuInfo. - static void Init(); - - /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error - /// and terminate. - static void VerifyCpuRequirements(); - - /// Returns all the flags for this cpu - static int64_t hardware_flags(); - - /// Returns whether of not the cpu supports this flag - inline static bool IsSupported(int64_t flag) { return (hardware_flags_ & flag) != 0; } - - /// Toggle a hardware feature on and off. It is not valid to turn on a feature - /// that the underlying hardware cannot support. This is useful for testing. - static void EnableFeature(int64_t flag, bool enable); - - /// Returns the size of the cache in KB at this cache level - static int64_t CacheSize(CacheLevel level); - - /// Returns the number of cpu cycles per millisecond - static int64_t cycles_per_ms(); - - /// Returns the number of cores (including hyper-threaded) on this machine. - static int num_cores(); - - /// Returns the model name of the cpu (e.g. Intel i7-2600) - static std::string model_name(); - - static bool initialized() { return initialized_; } - - private: - static bool initialized_; - static int64_t hardware_flags_; - static int64_t original_hardware_flags_; - static int64_t cache_sizes_[L3_CACHE + 1]; - static int64_t cycles_per_ms_; - static int num_cores_; - static std::string model_name_; // NOLINT -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_CPU_INFO_H diff --git a/src/parquet/util/hash-util.h b/src/parquet/util/hash-util.h deleted file mode 100644 index 28a523d7..00000000 --- a/src/parquet/util/hash-util.h +++ /dev/null @@ -1,258 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-02-22 - -#ifndef PARQUET_UTIL_HASH_UTIL_H -#define PARQUET_UTIL_HASH_UTIL_H - -#include - -#include "parquet/util/compiler-util.h" -#include "parquet/util/cpu-info.h" -#include "parquet/util/logging.h" -#include "parquet/util/sse-util.h" - -namespace parquet { - -/// Utility class to compute hash values. -class HashUtil { - public: - /// Compute the Crc32 hash for data using SSE4 instructions. The input hash - /// parameter is the current hash/seed value. - /// This should only be called if SSE is supported. - /// This is ~4x faster than Fnv/Boost Hash. - /// TODO: crc32 hashes with different seeds do not result in different hash functions. - /// The resulting hashes are correlated. - /// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate. - static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - uint32_t words = bytes / sizeof(uint32_t); - bytes = bytes % sizeof(uint32_t); - - const uint32_t* p = reinterpret_cast(data); - while (words--) { - hash = SSE4_crc32_u32(hash, *p); - ++p; - } - - const uint8_t* s = reinterpret_cast(p); - while (bytes--) { - hash = SSE4_crc32_u8(hash, *s); - ++s; - } - - // The lower half of the CRC hash has has poor uniformity, so swap the halves - // for anyone who only uses the first several bits of the hash. - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// CrcHash() specialized for 1-byte data - static inline uint32_t CrcHash1(const void* v, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - const uint8_t* s = reinterpret_cast(v); - hash = SSE4_crc32_u8(hash, *s); - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// CrcHash() specialized for 2-byte data - static inline uint32_t CrcHash2(const void* v, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - const uint16_t* s = reinterpret_cast(v); - hash = SSE4_crc32_u16(hash, *s); - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// CrcHash() specialized for 4-byte data - static inline uint32_t CrcHash4(const void* v, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - const uint32_t* p = reinterpret_cast(v); - hash = SSE4_crc32_u32(hash, *p); - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// CrcHash() specialized for 8-byte data - static inline uint32_t CrcHash8(const void* v, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - const uint64_t* p = reinterpret_cast(v); - hash = SSE4_crc32_u64(hash, *p); - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// CrcHash() specialized for 12-byte data - static inline uint32_t CrcHash12(const void* v, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - const uint64_t* p = reinterpret_cast(v); - hash = SSE4_crc32_u64(hash, *p); - ++p; - hash = SSE4_crc32_u32(hash, *reinterpret_cast(p)); - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// CrcHash() specialized for 16-byte data - static inline uint32_t CrcHash16(const void* v, uint32_t hash) { - DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2)); - const uint64_t* p = reinterpret_cast(v); - hash = SSE4_crc32_u64(hash, *p); - ++p; - hash = SSE4_crc32_u64(hash, *p); - hash = (hash << 16) | (hash >> 16); - return hash; - } - - static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995; - static const int MURMUR_R = 47; - - /// Murmur2 hash implementation returning 64-bit hashes. - static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) { - uint64_t h = seed ^ (len * MURMUR_PRIME); - - const uint64_t* data = reinterpret_cast(input); - const uint64_t* end = data + (len / sizeof(uint64_t)); - - while (data != end) { - uint64_t k = *data++; - k *= MURMUR_PRIME; - k ^= k >> MURMUR_R; - k *= MURMUR_PRIME; - h ^= k; - h *= MURMUR_PRIME; - } - - const uint8_t* data2 = reinterpret_cast(data); - switch (len & 7) { - case 7: - h ^= uint64_t(data2[6]) << 48; - case 6: - h ^= uint64_t(data2[5]) << 40; - case 5: - h ^= uint64_t(data2[4]) << 32; - case 4: - h ^= uint64_t(data2[3]) << 24; - case 3: - h ^= uint64_t(data2[2]) << 16; - case 2: - h ^= uint64_t(data2[1]) << 8; - case 1: - h ^= uint64_t(data2[0]); - h *= MURMUR_PRIME; - } - - h ^= h >> MURMUR_R; - h *= MURMUR_PRIME; - h ^= h >> MURMUR_R; - return h; - } - - /// default values recommended by http://isthe.com/chongo/tech/comp/fnv/ - static const uint32_t FNV_PRIME = 0x01000193; // 16777619 - static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 - static const uint64_t FNV64_PRIME = 1099511628211UL; - static const uint64_t FNV64_SEED = 14695981039346656037UL; - - /// Implementation of the Fowler-Noll-Vo hash function. This is not as performant - /// as boost's hash on int types (2x slower) but has bit entropy. - /// For ints, boost just returns the value of the int which can be pathological. - /// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000 - /// is taken on the hash, all values will collide to the same bucket. - /// For string values, Fnv is slightly faster than boost. - /// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit, - /// which can lead to poor results when input bytes are duplicated. - /// See FnvHash64to32() for how this can be mitigated. - static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) { - const uint8_t* ptr = reinterpret_cast(data); - while (bytes--) { - hash = (*ptr ^ hash) * FNV64_PRIME; - ++ptr; - } - return hash; - } - - /// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits. - /// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the - /// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs. - /// The input seed 'hash' is duplicated so the top half of the seed is not all zero. - /// Data length must be at least 1 byte: zero-length data should be handled separately, - /// for example using CombineHash with a unique constant value to avoid returning the - /// hash argument. Zero-length data gives terrible results: the initial hash value is - /// xored with itself cancelling all bits. - static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) { - // IMPALA-2270: this function should never be used for zero-byte inputs. - DCHECK_GT(bytes, 0); - uint64_t hash_u64 = hash | ((uint64_t)hash << 32); - hash_u64 = FnvHash64(data, bytes, hash_u64); - return (hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF); - } - - /// Computes the hash value for data. Will call either CrcHash or MurmurHash - /// depending on hardware capabilities. - /// Seed values for different steps of the query execution should use different seeds - /// to prevent accidental key collisions. (See IMPALA-219 for more details). - static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) { -#ifdef PARQUET_USE_SSE - if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) { - return CrcHash(data, bytes, seed); - } else { - return MurmurHash2_64(data, bytes, seed); - } -#else - return static_cast(MurmurHash2_64(data, bytes, seed)); -#endif - } - - /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio). - static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9; - - /// Combine hashes 'value' and 'seed' to get a new hash value. Similar to - /// boost::hash_combine(), but for uint32_t. This function should be used with a - /// constant first argument to update the hash value for zero-length values such as - /// NULL, boolean, and empty strings. - static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) { - return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2)); - } - - // Get 32 more bits of randomness from a 32-bit hash: - static inline uint32_t Rehash32to32(const uint32_t hash) { - // Constants generated by uuidgen(1) with the -r flag - static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull; - // This is strongly universal hashing following Dietzfelbinger's "Universal hashing - // and k-wise independent random variables via integer arithmetic without primes". As - // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the - // randomness of the constants) that any subset of bit positions of - // Rehash32to32(hash1) is equal to the same subset of bit positions - // Rehash32to32(hash2) is minimal. - return (static_cast(hash) * m + a) >> 32; - } - - static inline uint64_t Rehash32to64(const uint32_t hash) { - static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull, - a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull; - const uint64_t hash1 = (static_cast(hash) * m1 + a1) >> 32; - const uint64_t hash2 = (static_cast(hash) * m2 + a2) >> 32; - return hash1 | (hash2 << 32); - } -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_HASH_UTIL_H diff --git a/src/parquet/util/logging.h b/src/parquet/util/logging.h index 7ec5a8f5..e2c7abb1 100644 --- a/src/parquet/util/logging.h +++ b/src/parquet/util/logging.h @@ -18,109 +18,6 @@ #ifndef PARQUET_UTIL_LOGGING_H #define PARQUET_UTIL_LOGGING_H -#include - -namespace parquet { - -// Stubbed versions of macros defined in glog/logging.h, intended for -// environments where glog headers aren't available. -// -// Add more as needed. - -// Log levels. LOG ignores them, so their values are abitrary. - -#define PARQUET_INFO 0 -#define PARQUET_WARNING 1 -#define PARQUET_ERROR 2 -#define PARQUET_FATAL 3 - -#define PARQUET_LOG_INTERNAL(level) ::parquet::internal::CerrLog(level) -#define PARQUET_LOG(level) PARQUET_LOG_INTERNAL(PARQUET_##level) -#define PARQUET_IGNORE_EXPR(expr) ((void)(expr)); - -#define PARQUET_CHECK(condition) \ - (condition) ? 0 : PARQUET_LOG(FATAL) << "Check failed: " #condition " " - -#ifdef NDEBUG -#define PARQUET_DFATAL PARQUET_WARNING - -#define DCHECK(condition) \ - PARQUET_IGNORE_EXPR(condition) \ - while (false) \ - ::parquet::internal::NullLog() -#define DCHECK_EQ(val1, val2) \ - PARQUET_IGNORE_EXPR(val1) \ - while (false) \ - ::parquet::internal::NullLog() -#define DCHECK_NE(val1, val2) \ - PARQUET_IGNORE_EXPR(val1) \ - while (false) \ - ::parquet::internal::NullLog() -#define DCHECK_LE(val1, val2) \ - PARQUET_IGNORE_EXPR(val1) \ - while (false) \ - ::parquet::internal::NullLog() -#define DCHECK_LT(val1, val2) \ - PARQUET_IGNORE_EXPR(val1) \ - while (false) \ - ::parquet::internal::NullLog() -#define DCHECK_GE(val1, val2) \ - PARQUET_IGNORE_EXPR(val1) \ - while (false) \ - ::parquet::internal::NullLog() -#define DCHECK_GT(val1, val2) \ - PARQUET_IGNORE_EXPR(val1) \ - while (false) \ - ::parquet::internal::NullLog() - -#else -#define PARQUET_DFATAL PARQUET_FATAL - -#define DCHECK(condition) PARQUET_CHECK(condition) -#define DCHECK_EQ(val1, val2) PARQUET_CHECK((val1) == (val2)) -#define DCHECK_NE(val1, val2) PARQUET_CHECK((val1) != (val2)) -#define DCHECK_LE(val1, val2) PARQUET_CHECK((val1) <= (val2)) -#define DCHECK_LT(val1, val2) PARQUET_CHECK((val1) < (val2)) -#define DCHECK_GE(val1, val2) PARQUET_CHECK((val1) >= (val2)) -#define DCHECK_GT(val1, val2) PARQUET_CHECK((val1) > (val2)) - -#endif // NDEBUG - -namespace internal { - -class NullLog { - public: - template - NullLog& operator<<(const T& t) { - return *this; - } -}; - -class CerrLog { - public: - CerrLog(int severity) // NOLINT(runtime/explicit) - : severity_(severity), - has_logged_(false) {} - - ~CerrLog() { - if (has_logged_) { std::cerr << std::endl; } - if (severity_ == PARQUET_FATAL) { exit(1); } - } - - template - CerrLog& operator<<(const T& t) { - has_logged_ = true; - std::cerr << t; - return *this; - } - - private: - const int severity_; - bool has_logged_; -}; - -} // namespace internal - -} // namespace parquet +#include "arrow/util/logging.h" #endif // PARQUET_UTIL_LOGGING_H diff --git a/src/parquet/util/memory.cc b/src/parquet/util/memory.cc index 96131821..39c43fb3 100644 --- a/src/parquet/util/memory.cc +++ b/src/parquet/util/memory.cc @@ -24,10 +24,10 @@ #include #include "arrow/status.h" +#include "arrow/util/bit-util.h" #include "parquet/exception.h" #include "parquet/types.h" -#include "parquet/util/bit-util.h" #include "parquet/util/logging.h" using arrow::MemoryPool; @@ -123,7 +123,7 @@ template uint8_t* ChunkedAllocator::Allocate(int size) { if (size == 0) return NULL; - int64_t num_bytes = BitUtil::RoundUp(size, 8); + int64_t num_bytes = ::arrow::BitUtil::RoundUp(size, 8); if (current_chunk_idx_ == -1 || num_bytes + chunks_[current_chunk_idx_].allocated_bytes > chunks_[current_chunk_idx_].size) { diff --git a/src/parquet/util/memory.h b/src/parquet/util/memory.h index 2e7eb0f7..d73a3de7 100644 --- a/src/parquet/util/memory.h +++ b/src/parquet/util/memory.h @@ -80,6 +80,8 @@ class PARQUET_EXPORT Vector { void Swap(Vector& v); inline T& operator[](int64_t i) const { return data_[i]; } + const T* data() const { return data_; } + private: std::unique_ptr buffer_; int64_t size_; diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h deleted file mode 100644 index d9eb6a29..00000000 --- a/src/parquet/util/rle-encoding.h +++ /dev/null @@ -1,599 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29 - -#ifndef PARQUET_UTIL_RLE_ENCODING_H -#define PARQUET_UTIL_RLE_ENCODING_H - -#include -#include - -#include "parquet/util/bit-stream-utils.inline.h" -#include "parquet/util/bit-util.h" -#include "parquet/util/compiler-util.h" -#include "parquet/util/memory.h" - -namespace parquet { - -/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs -/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed -/// (literal encoding). -/// For both types of runs, there is a byte-aligned indicator which encodes the length -/// of the run and the type of the run. -/// This encoding has the benefit that when there aren't any long enough runs, values -/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and -/// the run length are byte aligned. This allows for very efficient decoding -/// implementations. -/// The encoding is: -/// encoded-block := run* -/// run := literal-run | repeated-run -/// literal-run := literal-indicator < literal bytes > -/// repeated-run := repeated-indicator < repeated value. padded to byte boundary > -/// literal-indicator := varint_encode( number_of_groups << 1 | 1) -/// repeated-indicator := varint_encode( number_of_repetitions << 1 ) -// -/// Each run is preceded by a varint. The varint's least significant bit is -/// used to indicate whether the run is a literal run or a repeated run. The rest -/// of the varint is used to determine the length of the run (eg how many times the -/// value repeats). -// -/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode -/// in groups of 8), so that no matter the bit-width of the value, the sequence will end -/// on a byte boundary without padding. -/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than -/// the actual number of encoded ints. (This means that the total number of encoded values -/// can not be determined from the encoded data, since the number of values in the last -/// group may not be a multiple of 8). For the last group of literal runs, we pad -/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side -/// without the need for additional checks. -// -/// There is a break-even point when it is more storage efficient to do run length -/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes -/// for both the repeated encoding or the literal encoding. This value can always -/// be computed based on the bit-width. -/// TODO: think about how to use this for strings. The bit packing isn't quite the same. -// -/// Examples with bit-width 1 (eg encoding booleans): -/// ---------------------------------------- -/// 100 1s followed by 100 0s: -/// <1, padded to 1 byte>   <0, padded to 1 byte> -/// - (total 4 bytes) -// -/// alternating 1s and 0s (200 total): -/// 200 ints = 25 groups of 8 -/// <25 bytes of values, bitpacked> -/// (total 26 bytes, 1 byte overhead) -// - -/// Decoder class for RLE encoded data. -class RleDecoder { - public: - /// Create a decoder object. buffer/buffer_len is the decoded data. - /// bit_width is the width of each value (before encoding). - RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0) { - DCHECK_GE(bit_width_, 0); - DCHECK_LE(bit_width_, 64); - } - - RleDecoder() : bit_width_(-1) {} - - void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { - DCHECK_GE(bit_width, 0); - DCHECK_LE(bit_width, 64); - bit_reader_.Reset(buffer, buffer_len); - bit_width_ = bit_width; - current_value_ = 0; - repeat_count_ = 0; - literal_count_ = 0; - } - - /// Gets the next value. Returns false if there are no more. - template - bool Get(T* val); - - /// Gets a batch of values. Returns the number of decoded elements. - template - int GetBatch(T* values, int batch_size); - - /// Like GetBatch but the values are then decoded using the provided dictionary - template - int GetBatchWithDict(const Vector& dictionary, T* values, int batch_size); - - /// Like GetBatchWithDict but add spacing for null entries - template - int GetBatchWithDictSpaced(const Vector& dictionary, T* values, int batch_size, - int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset); - - protected: - BitReader bit_reader_; - /// Number of bits needed to encode the value. Must be between 0 and 64. - int bit_width_; - uint64_t current_value_; - uint32_t repeat_count_; - uint32_t literal_count_; - - private: - /// Fills literal_count_ and repeat_count_ with next values. Returns false if there - /// are no more. - template - bool NextCounts(); -}; - -/// Class to incrementally build the rle data. This class does not allocate any memory. -/// The encoding has two modes: encoding repeated runs and literal runs. -/// If the run is sufficiently short, it is more efficient to encode as a literal run. -/// This class does so by buffering 8 values at a time. If they are not all the same -/// they are added to the literal run. If they are the same, they are added to the -/// repeated run. When we switch modes, the previous run is flushed out. -class RleEncoder { - public: - /// buffer/buffer_len: preallocated output buffer. - /// bit_width: max number of bits for value. - /// TODO: consider adding a min_repeated_run_length so the caller can control - /// when values should be encoded as repeated runs. Currently this is derived - /// based on the bit_width, which can determine a storage optimal choice. - /// TODO: allow 0 bit_width (and have dict encoder use it) - RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) - : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { - DCHECK_GE(bit_width_, 0); - DCHECK_LE(bit_width_, 64); - max_run_byte_size_ = MinBufferSize(bit_width); - DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough."; - Clear(); - } - - /// Returns the minimum buffer size needed to use the encoder for 'bit_width' - /// This is the maximum length of a single run for 'bit_width'. - /// It is not valid to pass a buffer less than this length. - static int MinBufferSize(int bit_width) { - /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. - int max_literal_run_size = - 1 + static_cast(BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8)); - /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. - int max_repeated_run_size = - BitReader::MAX_VLQ_BYTE_LEN + static_cast(BitUtil::Ceil(bit_width, 8)); - return std::max(max_literal_run_size, max_repeated_run_size); - } - - /// Returns the maximum byte size it could take to encode 'num_values'. - static int MaxBufferSize(int bit_width, int num_values) { - // For a bit_width > 1, the worst case is the repetition of "literal run of length 8 - // and then a repeated run of length 8". - // 8 values per smallest run, 8 bits per byte - // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8); - int bytes_per_run = bit_width; - int num_runs = static_cast(BitUtil::Ceil(num_values, 8)); - int literal_max_size = num_runs + num_runs * bytes_per_run; - - // In the very worst case scenario, the data is a concatenation of repeated - // runs of 8 values. Repeated run has a 1 byte varint followed by the - // bit-packed repeated value - int min_repeated_run_size = 1 + static_cast(BitUtil::Ceil(bit_width, 8)); - int repeated_max_size = - static_cast(BitUtil::Ceil(num_values, 8)) * min_repeated_run_size; - - return std::max(literal_max_size, repeated_max_size); - } - - /// Encode value. Returns true if the value fits in buffer, false otherwise. - /// This value must be representable with bit_width_ bits. - bool Put(uint64_t value); - - /// Flushes any pending values to the underlying buffer. - /// Returns the total number of bytes written - int Flush(); - - /// Resets all the state in the encoder. - void Clear(); - - /// Returns pointer to underlying buffer - uint8_t* buffer() { return bit_writer_.buffer(); } - int32_t len() { return bit_writer_.bytes_written(); } - - private: - /// Flushes any buffered values. If this is part of a repeated run, this is largely - /// a no-op. - /// If it is part of a literal run, this will call FlushLiteralRun, which writes - /// out the buffered literal values. - /// If 'done' is true, the current run would be written even if it would normally - /// have been buffered more. This should only be called at the end, when the - /// encoder has received all values even if it would normally continue to be - /// buffered. - void FlushBufferedValues(bool done); - - /// Flushes literal values to the underlying buffer. If update_indicator_byte, - /// then the current literal run is complete and the indicator byte is updated. - void FlushLiteralRun(bool update_indicator_byte); - - /// Flushes a repeated run to the underlying buffer. - void FlushRepeatedRun(); - - /// Checks and sets buffer_full_. This must be called after flushing a run to - /// make sure there are enough bytes remaining to encode the next run. - void CheckBufferFull(); - - /// The maximum number of values in a single literal run - /// (number of groups encodable by a 1-byte indicator * 8) - static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8; - - /// Number of bits needed to encode the value. Must be between 0 and 64. - const int bit_width_; - - /// Underlying buffer. - BitWriter bit_writer_; - - /// If true, the buffer is full and subsequent Put()'s will fail. - bool buffer_full_; - - /// The maximum byte size a single run can take. - int max_run_byte_size_; - - /// We need to buffer at most 8 values for literals. This happens when the - /// bit_width is 1 (so 8 values fit in one byte). - /// TODO: generalize this to other bit widths - int64_t buffered_values_[8]; - - /// Number of values in buffered_values_ - int num_buffered_values_; - - /// The current (also last) value that was written and the count of how - /// many times in a row that value has been seen. This is maintained even - /// if we are in a literal run. If the repeat_count_ get high enough, we switch - /// to encoding repeated runs. - uint64_t current_value_; - int repeat_count_; - - /// Number of literals in the current run. This does not include the literals - /// that might be in buffered_values_. Only after we've got a group big enough - /// can we decide if they should part of the literal_count_ or repeat_count_ - int literal_count_; - - /// Pointer to a byte in the underlying buffer that stores the indicator byte. - /// This is reserved as soon as we need a literal run but the value is written - /// when the literal run is complete. - uint8_t* literal_indicator_byte_; -}; - -template -inline bool RleDecoder::Get(T* val) { - return GetBatch(val, 1) == 1; -} - -template -inline int RleDecoder::GetBatch(T* values, int batch_size) { - DCHECK_GE(bit_width_, 0); - int values_read = 0; - - while (values_read < batch_size) { - if (repeat_count_ > 0) { - int repeat_batch = - std::min(batch_size - values_read, static_cast(repeat_count_)); - std::fill(values + values_read, values + values_read + repeat_batch, - static_cast(current_value_)); - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = - std::min(batch_size - values_read, static_cast(literal_count_)); - int actual_read = - bit_reader_.GetBatch(bit_width_, values + values_read, literal_batch); - DCHECK_EQ(actual_read, literal_batch); - literal_count_ -= literal_batch; - values_read += literal_batch; - } else { - if (!NextCounts()) return values_read; - } - } - - return values_read; -} - -template -inline int RleDecoder::GetBatchWithDict( - const Vector& dictionary, T* values, int batch_size) { - DCHECK_GE(bit_width_, 0); - int values_read = 0; - - while (values_read < batch_size) { - if (repeat_count_ > 0) { - int repeat_batch = - std::min(batch_size - values_read, static_cast(repeat_count_)); - std::fill(values + values_read, values + values_read + repeat_batch, - dictionary[current_value_]); - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = - std::min(batch_size - values_read, static_cast(literal_count_)); - - const int buffer_size = 1024; - int indices[buffer_size]; - literal_batch = std::min(literal_batch, buffer_size); - int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); - DCHECK_EQ(actual_read, literal_batch); - for (int i = 0; i < literal_batch; ++i) { - values[values_read + i] = dictionary[indices[i]]; - } - literal_count_ -= literal_batch; - values_read += literal_batch; - } else { - if (!NextCounts()) return values_read; - } - } - - return values_read; -} - -template -inline int RleDecoder::GetBatchWithDictSpaced(const Vector& dictionary, T* values, - int batch_size, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset) { - DCHECK_GE(bit_width_, 0); - int values_read = 0; - int remaining_nulls = null_count; - INIT_BITSET(valid_bits, static_cast(valid_bits_offset)); - - while (values_read < batch_size) { - bool is_valid = (bitset_valid_bits & (1 << bit_offset_valid_bits)) != 0; - READ_NEXT_BITSET(valid_bits); - - if (is_valid) { - if ((repeat_count_ == 0) && (literal_count_ == 0)) { - if (!NextCounts()) return values_read; - } - if (repeat_count_ > 0) { - T value = dictionary[current_value_]; - // The current index is already valid, we don't need to check that again - int repeat_batch = 1; - repeat_count_--; - - while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { - repeat_count_--; - } else { - remaining_nulls--; - } - repeat_batch++; - - READ_NEXT_BITSET(valid_bits); - } - std::fill(values + values_read, values + values_read + repeat_batch, value); - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = std::min( - batch_size - values_read - remaining_nulls, static_cast(literal_count_)); - - // Decode the literals - constexpr int kBufferSize = 1024; - int indices[kBufferSize]; - literal_batch = std::min(literal_batch, kBufferSize); - int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); - DCHECK_EQ(actual_read, literal_batch); - - int skipped = 0; - int literals_read = 1; - values[values_read] = dictionary[indices[0]]; - - // Read the first bitset to the end - while (literals_read < literal_batch) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { - values[values_read + literals_read + skipped] = - dictionary[indices[literals_read]]; - literals_read++; - } else { - skipped++; - } - - READ_NEXT_BITSET(valid_bits); - } - literal_count_ -= literal_batch; - values_read += literal_batch + skipped; - remaining_nulls -= skipped; - } - } else { - values_read++; - remaining_nulls--; - } - } - - return values_read; -} - -template -bool RleDecoder::NextCounts() { - // Read the next run's indicator int, it could be a literal or repeated run. - // The int is encoded as a vlq-encoded value. - int32_t indicator_value = 0; - bool result = bit_reader_.GetVlqInt(&indicator_value); - if (!result) return false; - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - if (is_literal) { - literal_count_ = (indicator_value >> 1) * 8; - } else { - repeat_count_ = indicator_value >> 1; - bool result = - bit_reader_.GetAligned(static_cast(BitUtil::Ceil(bit_width_, 8)), - reinterpret_cast(¤t_value_)); - DCHECK(result); - } - return true; -} - -/// This function buffers input values 8 at a time. After seeing all 8 values, -/// it decides whether they should be encoded as a literal or repeated run. -inline bool RleEncoder::Put(uint64_t value) { - DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_)); - if (UNLIKELY(buffer_full_)) return false; - - if (LIKELY(current_value_ == value)) { - ++repeat_count_; - if (repeat_count_ > 8) { - // This is just a continuation of the current run, no need to buffer the - // values. - // Note that this is the fast path for long repeated runs. - return true; - } - } else { - if (repeat_count_ >= 8) { - // We had a run that was long enough but it has ended. Flush the - // current repeated run. - DCHECK_EQ(literal_count_, 0); - FlushRepeatedRun(); - } - repeat_count_ = 1; - current_value_ = value; - } - - buffered_values_[num_buffered_values_] = value; - if (++num_buffered_values_ == 8) { - DCHECK_EQ(literal_count_ % 8, 0); - FlushBufferedValues(false); - } - return true; -} - -inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { - if (literal_indicator_byte_ == NULL) { - // The literal indicator byte has not been reserved yet, get one now. - literal_indicator_byte_ = bit_writer_.GetNextBytePtr(); - DCHECK(literal_indicator_byte_ != NULL); - } - - // Write all the buffered values as bit packed literals - for (int i = 0; i < num_buffered_values_; ++i) { - bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_); - DCHECK(success) << "There is a bug in using CheckBufferFull()"; - } - num_buffered_values_ = 0; - - if (update_indicator_byte) { - // At this point we need to write the indicator byte for the literal run. - // We only reserve one byte, to allow for streaming writes of literal values. - // The logic makes sure we flush literal runs often enough to not overrun - // the 1 byte. - DCHECK_EQ(literal_count_ % 8, 0); - int num_groups = literal_count_ / 8; - int32_t indicator_value = (num_groups << 1) | 1; - DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); - *literal_indicator_byte_ = indicator_value; - literal_indicator_byte_ = NULL; - literal_count_ = 0; - CheckBufferFull(); - } -} - -inline void RleEncoder::FlushRepeatedRun() { - DCHECK_GT(repeat_count_, 0); - bool result = true; - // The lsb of 0 indicates this is a repeated run - int32_t indicator_value = repeat_count_ << 1 | 0; - result &= bit_writer_.PutVlqInt(indicator_value); - result &= bit_writer_.PutAligned( - current_value_, static_cast(BitUtil::Ceil(bit_width_, 8))); - DCHECK(result); - num_buffered_values_ = 0; - repeat_count_ = 0; - CheckBufferFull(); -} - -/// Flush the values that have been buffered. At this point we decide whether -/// we need to switch between the run types or continue the current one. -inline void RleEncoder::FlushBufferedValues(bool done) { - if (repeat_count_ >= 8) { - // Clear the buffered values. They are part of the repeated run now and we - // don't want to flush them out as literals. - num_buffered_values_ = 0; - if (literal_count_ != 0) { - // There was a current literal run. All the values in it have been flushed - // but we still need to update the indicator byte. - DCHECK_EQ(literal_count_ % 8, 0); - DCHECK_EQ(repeat_count_, 8); - FlushLiteralRun(true); - } - DCHECK_EQ(literal_count_, 0); - return; - } - - literal_count_ += num_buffered_values_; - DCHECK_EQ(literal_count_ % 8, 0); - int num_groups = literal_count_ / 8; - if (num_groups + 1 >= (1 << 6)) { - // We need to start a new literal run because the indicator byte we've reserved - // cannot store more values. - DCHECK(literal_indicator_byte_ != NULL); - FlushLiteralRun(true); - } else { - FlushLiteralRun(done); - } - repeat_count_ = 0; -} - -inline int RleEncoder::Flush() { - if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { - bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || - num_buffered_values_ == 0); - // There is something pending, figure out if it's a repeated or literal run - if (repeat_count_ > 0 && all_repeat) { - FlushRepeatedRun(); - } else { - DCHECK_EQ(literal_count_ % 8, 0); - // Buffer the last group of literals to 8 by padding with 0s. - for (; num_buffered_values_ != 0 && num_buffered_values_ < 8; - ++num_buffered_values_) { - buffered_values_[num_buffered_values_] = 0; - } - literal_count_ += num_buffered_values_; - FlushLiteralRun(true); - repeat_count_ = 0; - } - } - bit_writer_.Flush(); - DCHECK_EQ(num_buffered_values_, 0); - DCHECK_EQ(literal_count_, 0); - DCHECK_EQ(repeat_count_, 0); - - return bit_writer_.bytes_written(); -} - -inline void RleEncoder::CheckBufferFull() { - int bytes_written = bit_writer_.bytes_written(); - if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) { - buffer_full_ = true; - } -} - -inline void RleEncoder::Clear() { - buffer_full_ = false; - current_value_ = 0; - repeat_count_ = 0; - num_buffered_values_ = 0; - literal_count_ = 0; - literal_indicator_byte_ = NULL; - bit_writer_.Clear(); -} - -} // namespace parquet - -#endif // PARQUET_UTIL_RLE_ENCODING_H diff --git a/src/parquet/util/rle-test.cc b/src/parquet/util/rle-test.cc deleted file mode 100644 index 4aa96c3e..00000000 --- a/src/parquet/util/rle-test.cc +++ /dev/null @@ -1,460 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29 - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -#include "parquet/util/bit-stream-utils.inline.h" -#include "parquet/util/rle-encoding.h" - -using std::vector; - -namespace parquet { - -const int MAX_WIDTH = 32; - -TEST(BitArray, TestBool) { - const int len = 8; - uint8_t buffer[len]; - - BitWriter writer(buffer, len); - - // Write alternating 0's and 1's - for (int i = 0; i < 8; ++i) { - bool result = writer.PutValue(i % 2, 1); - EXPECT_TRUE(result); - } - writer.Flush(); - EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); - - // Write 00110011 - for (int i = 0; i < 8; ++i) { - bool result = false; - switch (i) { - case 0: - case 1: - case 4: - case 5: - result = writer.PutValue(false, 1); - break; - default: - result = writer.PutValue(true, 1); - break; - } - EXPECT_TRUE(result); - } - writer.Flush(); - - // Validate the exact bit value - EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); - EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0)); - - // Use the reader and validate - BitReader reader(buffer, len); - for (int i = 0; i < 8; ++i) { - bool val = false; - bool result = reader.GetValue(1, &val); - EXPECT_TRUE(result); - EXPECT_EQ(val, (i % 2) != 0); - } - - for (int i = 0; i < 8; ++i) { - bool val = false; - bool result = reader.GetValue(1, &val); - EXPECT_TRUE(result); - switch (i) { - case 0: - case 1: - case 4: - case 5: - EXPECT_EQ(val, false); - break; - default: - EXPECT_EQ(val, true); - break; - } - } -} - -// Writes 'num_vals' values with width 'bit_width' and reads them back. -void TestBitArrayValues(int bit_width, int num_vals) { - int len = static_cast(BitUtil::Ceil(bit_width * num_vals, 8)); - EXPECT_TRUE(len > 0); - const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width; - - std::vector buffer(len); - BitWriter writer(buffer.data(), len); - for (int i = 0; i < num_vals; ++i) { - bool result = writer.PutValue(i % mod, bit_width); - EXPECT_TRUE(result); - } - writer.Flush(); - EXPECT_EQ(writer.bytes_written(), len); - - BitReader reader(buffer.data(), len); - for (int i = 0; i < num_vals; ++i) { - int64_t val = 0; - bool result = reader.GetValue(bit_width, &val); - EXPECT_TRUE(result); - EXPECT_EQ(val, i % mod); - } - EXPECT_EQ(reader.bytes_left(), 0); -} - -TEST(BitArray, TestValues) { - for (int width = 1; width <= MAX_WIDTH; ++width) { - TestBitArrayValues(width, 1); - TestBitArrayValues(width, 2); - // Don't write too many values - TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096); - TestBitArrayValues(width, 1024); - } -} - -// Test some mixed values -TEST(BitArray, TestMixed) { - const int len = 1024; - uint8_t buffer[len]; - bool parity = true; - - BitWriter writer(buffer, len); - for (int i = 0; i < len; ++i) { - bool result; - if (i % 2 == 0) { - result = writer.PutValue(parity, 1); - parity = !parity; - } else { - result = writer.PutValue(i, 10); - } - EXPECT_TRUE(result); - } - writer.Flush(); - - parity = true; - BitReader reader(buffer, len); - for (int i = 0; i < len; ++i) { - bool result; - if (i % 2 == 0) { - bool val; - result = reader.GetValue(1, &val); - EXPECT_EQ(val, parity); - parity = !parity; - } else { - int val; - result = reader.GetValue(10, &val); - EXPECT_EQ(val, i); - } - EXPECT_TRUE(result); - } -} - -// Validates encoding of values by encoding and decoding them. If -// expected_encoding != NULL, also validates that the encoded buffer is -// exactly 'expected_encoding'. -// if expected_len is not -1, it will validate the encoded size is correct. -void ValidateRle(const vector& values, int bit_width, uint8_t* expected_encoding, - int expected_len) { - const int len = 64 * 1024; - uint8_t buffer[len]; - EXPECT_LE(expected_len, len); - - RleEncoder encoder(buffer, len, bit_width); - for (size_t i = 0; i < values.size(); ++i) { - bool result = encoder.Put(values[i]); - EXPECT_TRUE(result); - } - int encoded_len = encoder.Flush(); - - if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); } - if (expected_encoding != NULL) { - EXPECT_TRUE(memcmp(buffer, expected_encoding, expected_len) == 0); - } - - // Verify read - { - RleDecoder decoder(buffer, len, bit_width); - for (size_t i = 0; i < values.size(); ++i) { - uint64_t val; - bool result = decoder.Get(&val); - EXPECT_TRUE(result); - EXPECT_EQ(values[i], val); - } - } - - // Verify batch read - { - RleDecoder decoder(buffer, len, bit_width); - vector values_read(values.size()); - ASSERT_EQ(values.size(), - decoder.GetBatch(values_read.data(), static_cast(values.size()))); - EXPECT_EQ(values, values_read); - } -} - -// A version of ValidateRle that round-trips the values and returns false if -// the returned values are not all the same -bool CheckRoundTrip(const vector& values, int bit_width) { - const int len = 64 * 1024; - uint8_t buffer[len]; - RleEncoder encoder(buffer, len, bit_width); - for (size_t i = 0; i < values.size(); ++i) { - bool result = encoder.Put(values[i]); - if (!result) { return false; } - } - int encoded_len = encoder.Flush(); - int out = 0; - - { - RleDecoder decoder(buffer, encoded_len, bit_width); - for (size_t i = 0; i < values.size(); ++i) { - EXPECT_TRUE(decoder.Get(&out)); - if (values[i] != out) { return false; } - } - } - - // Verify batch read - { - RleDecoder decoder(buffer, len, bit_width); - vector values_read(values.size()); - if (static_cast(values.size()) != - decoder.GetBatch(values_read.data(), static_cast(values.size()))) { - return false; - } - if (values != values_read) { return false; } - } - - return true; -} - -TEST(Rle, SpecificSequences) { - const int len = 1024; - uint8_t expected_buffer[len]; - vector values; - - // Test 50 0' followed by 50 1's - values.resize(100); - for (int i = 0; i < 50; ++i) { - values[i] = 0; - } - for (int i = 50; i < 100; ++i) { - values[i] = 1; - } - - // expected_buffer valid for bit width <= 1 byte - expected_buffer[0] = (50 << 1); - expected_buffer[1] = 0; - expected_buffer[2] = (50 << 1); - expected_buffer[3] = 1; - for (int width = 1; width <= 8; ++width) { - ValidateRle(values, width, expected_buffer, 4); - } - - for (int width = 9; width <= MAX_WIDTH; ++width) { - ValidateRle(values, width, NULL, 2 * (1 + static_cast(BitUtil::Ceil(width, 8)))); - } - - // Test 100 0's and 1's alternating - for (int i = 0; i < 100; ++i) { - values[i] = i % 2; - } - int num_groups = static_cast(BitUtil::Ceil(100, 8)); - expected_buffer[0] = (num_groups << 1) | 1; - for (int i = 1; i <= 100 / 8; ++i) { - expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0); - } - // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0. - expected_buffer[100 / 8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0); - - // num_groups and expected_buffer only valid for bit width = 1 - ValidateRle(values, 1, expected_buffer, 1 + num_groups); - for (int width = 2; width <= MAX_WIDTH; ++width) { - int num_values = static_cast(BitUtil::Ceil(100, 8)) * 8; - ValidateRle( - values, width, NULL, 1 + static_cast(BitUtil::Ceil(width * num_values, 8))); - } -} - -// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value -// is used, otherwise alternating values are used. -void TestRleValues(int bit_width, int num_vals, int value = -1) { - const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width; - vector values; - for (int v = 0; v < num_vals; ++v) { - values.push_back((value != -1) ? value : (v % mod)); - } - ValidateRle(values, bit_width, NULL, -1); -} - -TEST(Rle, TestValues) { - for (int width = 1; width <= MAX_WIDTH; ++width) { - TestRleValues(width, 1); - TestRleValues(width, 1024); - TestRleValues(width, 1024, 0); - TestRleValues(width, 1024, 1); - } -} - -TEST(Rle, BitWidthZeroRepeated) { - uint8_t buffer[1]; - const int num_values = 15; - buffer[0] = num_values << 1; // repeated indicator byte - RleDecoder decoder(buffer, sizeof(buffer), 0); - uint8_t val; - for (int i = 0; i < num_values; ++i) { - bool result = decoder.Get(&val); - EXPECT_TRUE(result); - EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 - } - EXPECT_FALSE(decoder.Get(&val)); -} - -TEST(Rle, BitWidthZeroLiteral) { - uint8_t buffer[1]; - const int num_groups = 4; - buffer[0] = num_groups << 1 | 1; // literal indicator byte - RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0); - const int num_values = num_groups * 8; - uint8_t val; - for (int i = 0; i < num_values; ++i) { - bool result = decoder.Get(&val); - EXPECT_TRUE(result); - EXPECT_EQ(val, 0); // can only encode 0s with bit width 0 - } - EXPECT_FALSE(decoder.Get(&val)); -} - -// Test that writes out a repeated group and then a literal -// group but flush before finishing. -TEST(BitRle, Flush) { - vector values; - for (int i = 0; i < 16; ++i) - values.push_back(1); - values.push_back(0); - ValidateRle(values, 1, NULL, -1); - values.push_back(1); - ValidateRle(values, 1, NULL, -1); - values.push_back(1); - ValidateRle(values, 1, NULL, -1); - values.push_back(1); - ValidateRle(values, 1, NULL, -1); -} - -// Test some random sequences. -TEST(BitRle, Random) { - int niters = 50; - int ngroups = 1000; - int max_group_size = 16; - vector values(ngroups + max_group_size); - - // prng setup - std::random_device rd; - std::uniform_int_distribution dist(1, 20); - - for (int iter = 0; iter < niters; ++iter) { - // generate a seed with device entropy - uint32_t seed = rd(); - std::mt19937 gen(seed); - - bool parity = 0; - values.resize(0); - - for (int i = 0; i < ngroups; ++i) { - int group_size = dist(gen); - if (group_size > max_group_size) { group_size = 1; } - for (int i = 0; i < group_size; ++i) { - values.push_back(parity); - } - parity = !parity; - } - if (!CheckRoundTrip(values, BitUtil::NumRequiredBits(values.size()))) { - FAIL() << "failing seed: " << seed; - } - } -} - -// Test a sequence of 1 0's, 2 1's, 3 0's. etc -// e.g. 011000111100000 -TEST(BitRle, RepeatedPattern) { - vector values; - const int min_run = 1; - const int max_run = 32; - - for (int i = min_run; i <= max_run; ++i) { - int v = i % 2; - for (int j = 0; j < i; ++j) { - values.push_back(v); - } - } - - // And go back down again - for (int i = max_run; i >= min_run; --i) { - int v = i % 2; - for (int j = 0; j < i; ++j) { - values.push_back(v); - } - } - - ValidateRle(values, 1, NULL, -1); -} - -TEST(BitRle, Overflow) { - for (int bit_width = 1; bit_width < 32; bit_width += 3) { - int len = RleEncoder::MinBufferSize(bit_width); - std::vector buffer(len); - int num_added = 0; - bool parity = true; - - RleEncoder encoder(buffer.data(), len, bit_width); - // Insert alternating true/false until there is no space left - while (true) { - bool result = encoder.Put(parity); - parity = !parity; - if (!result) break; - ++num_added; - } - - int bytes_written = encoder.Flush(); - EXPECT_LE(bytes_written, len); - EXPECT_GT(num_added, 0); - - RleDecoder decoder(buffer.data(), bytes_written, bit_width); - parity = true; - uint32_t v; - for (int i = 0; i < num_added; ++i) { - bool result = decoder.Get(&v); - EXPECT_TRUE(result); - EXPECT_EQ(v != 0, parity); - parity = !parity; - } - // Make sure we get false when reading past end a couple times. - EXPECT_FALSE(decoder.Get(&v)); - EXPECT_FALSE(decoder.Get(&v)); - } -} - -} // namespace parquet diff --git a/src/parquet/util/sse-util.h b/src/parquet/util/sse-util.h deleted file mode 100644 index 653c1714..00000000 --- a/src/parquet/util/sse-util.h +++ /dev/null @@ -1,237 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29. Pared down to a minimal set of -// functions needed for parquet-cpp - -#ifndef PARQUET_UTIL_SSE_UTIL_H -#define PARQUET_UTIL_SSE_UTIL_H - -#ifdef PARQUET_USE_SSE -#include -#endif - -namespace parquet { - -/// This class contains constants useful for text processing with SSE4.2 intrinsics. -namespace SSEUtil { -/// Number of characters that fit in 64/128 bit register. SSE provides instructions -/// for loading 64 or 128 bits into a register at a time. -static const int CHARS_PER_64_BIT_REGISTER = 8; -static const int CHARS_PER_128_BIT_REGISTER = 16; - -/// SSE4.2 adds instructions for text processing. The instructions have a control -/// byte that determines some of functionality of the instruction. (Equivalent to -/// GCC's _SIDD_CMP_EQUAL_ANY, etc). -static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr -static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp -static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) -static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. - -/// In this mode, SSE text processing functions will return a mask of all the -/// characters that matched. -static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; - -/// In this mode, SSE text processing functions will return the number of -/// bytes that match consecutively from the beginning. -static const int STRCMP_MODE = - PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY; - -/// Precomputed mask values up to 16 bits. -static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, - 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, -}; -} // namespace SSEUtil - -#ifdef PARQUET_USE_SSE - -/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen -/// IR load time) that the processor supports SSE 4.2 before calling these. These are -/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros. -#ifndef IR_COMPILE -/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler -/// flag. Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime -/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2 -/// (IMPALA-1399/1646). The compiler intrinsics cannot be used without -msse4.2, so we -/// define our own implementations of the intrinsics instead. - -/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an -/// immediate. So, those need to be always inlined in order to always propagate the -/// mode constant into the inline asm. -#define SSE_ALWAYS_INLINE inline __attribute__((__always_inline__)) - -template -static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { -#ifdef __clang__ - /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 - - /// clang doesn't support Y-prefixed asm constraints. - register volatile __m128i result asm("xmm0"); - __asm__ volatile("pcmpestrm %5, %2, %1" - : "=x"(result) - : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) - : "cc"); -#else - __m128i result; - __asm__ volatile("pcmpestrm %5, %2, %1" - : "=Yz"(result) - : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) - : "cc"); -#endif - return result; -} - -template -static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - int result; - __asm__("pcmpestri %5, %2, %1" - : "=c"(result) - : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) - : "cc"); - return result; -} - -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { - __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v)); - return crc; -} - -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { - __asm__("crc32w %1, %0" : "+r"(crc) : "rm"(v)); - return crc; -} - -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { - __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v)); - return crc; -} - -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { - uint64_t result = crc; - __asm__("crc32q %1, %0" : "+r"(result) : "rm"(v)); - return result; -} - -static inline int64_t POPCNT_popcnt_u64(uint64_t a) { - int64_t result; - __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc"); - return result; -} - -#undef SSE_ALWAYS_INLINE - -#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2. -/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not -/// support it. However, the cross-compiled IR is compiled twice: with and without -/// -msse4.2. When -msse4.2 is enabled in the cross-compile, we can just use the -/// compiler intrinsics. - -#include - -template -static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestrm(str1, len1, str2, len2, MODE); -} - -template -static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestri(str1, len1, str2, len2, MODE); -} - -#define SSE4_crc32_u8 _mm_crc32_u8 -#define SSE4_crc32_u16 _mm_crc32_u16 -#define SSE4_crc32_u32 _mm_crc32_u32 -#define SSE4_crc32_u64 _mm_crc32_u64 -#define POPCNT_popcnt_u64 _mm_popcnt_u64 - -#else // IR_COMPILE without SSE 4.2. -/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use -/// SSE 4.2 instructions. Otherwise, the IR loading will fail on CPUs that don't -/// support SSE 4.2. However, because the caller isn't allowed to call these routines -/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case. - -template -static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return (__m128i){0}; // NOLINT -} - -template -static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return 0; -} - -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return 0; -} - -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return 0; -} - -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return 0; -} - -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return 0; -} - -static inline int64_t POPCNT_popcnt_u64(uint64_t a) { - DCHECK(false) << "CPU doesn't support SSE 4.2"; - return 0; -} - -#endif // IR_COMPILE - -#else - -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "SSE support is not enabled"; - return 0; -} - -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "SSE support is not enabled"; - return 0; -} - -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "SSE support is not enabled"; - return 0; -} - -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { - DCHECK(false) << "SSE support is not enabled"; - return 0; -} - -static inline int64_t POPCNT_popcnt_u64(uint64_t a) { - DCHECK(false) << "SSE support is not enabled"; - return 0; -} - -#endif // PARQUET_USE_SSE - -} // namespace parquet - -#endif // PARQUET_UTIL_SSE_UTIL_H