diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47984e6c..0627b00f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -618,8 +618,6 @@ set(LIBPARQUET_SRCS
 
   src/parquet/parquet_constants.cpp
   src/parquet/parquet_types.cpp
-
-  src/parquet/util/cpu-info.cc
   src/parquet/util/memory.cc
 )
 
diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc
index 44776a8f..5514d8b7 100644
--- a/benchmarks/decode_benchmark.cc
+++ b/benchmarks/decode_benchmark.cc
@@ -40,10 +40,10 @@ class DeltaBitPackEncoder {
 
   uint8_t* Encode(int* encoded_len) {
     uint8_t* result = new uint8_t[10 * 1024 * 1024];
-    int num_mini_blocks = parquet::BitUtil::Ceil(num_values() - 1, mini_block_size_);
+    int num_mini_blocks = arrow::BitUtil::Ceil(num_values() - 1, mini_block_size_);
     uint8_t* mini_block_widths = NULL;
 
-    parquet::BitWriter writer(result, 10 * 1024 * 1024);
+    arrow::BitWriter writer(result, 10 * 1024 * 1024);
 
     // Writer the size of each block. We only use 1 block currently.
     writer.PutVlqInt(num_mini_blocks * mini_block_size_);
@@ -83,7 +83,7 @@ class DeltaBitPackEncoder {
 
       // The bit width for this block is the number of bits needed to store
       // (max_delta - min_delta).
-      int bit_width = parquet::BitUtil::NumRequiredBits(max_delta - min_delta);
+      int bit_width = arrow::BitUtil::NumRequiredBits(max_delta - min_delta);
       mini_block_widths[i] = bit_width;
 
       // Encode this mini blocking using min_delta and bit_width
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 5c4e5653..716debcb 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -109,34 +109,6 @@ set(LIBS ${LIBS} ${Boost_LIBRARIES})
 # ----------------------------------------------------------------------
 # ZLIB
 
-set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install")
-set(ZLIB_HOME "${ZLIB_PREFIX}")
-set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include")
-if (MSVC)
-  if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG")
-    set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib)
-  else()
-    set(ZLIB_STATIC_LIB_NAME zlibstatic.lib)
-  endif()
-else()
-  set(ZLIB_STATIC_LIB_NAME libz.a)
-endif()
-set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}")
-set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-  -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}
-  -DCMAKE_C_FLAGS=${EP_C_FLAGS}
-  -DBUILD_SHARED_LIBS=OFF)
-ExternalProject_Add(zlib_ep
-  URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz"
-  BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}"
-  ${ZLIB_BUILD_BYPRODUCTS}
-  CMAKE_ARGS ${ZLIB_CMAKE_ARGS})
-
-include_directories(SYSTEM ${ZLIB_INCLUDE_DIR})
-add_library(zlibstatic STATIC IMPORTED)
-set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB})
-add_dependencies(zlibstatic zlib_ep)
-
 # ----------------------------------------------------------------------
 # Thrift
 
@@ -144,6 +116,29 @@ add_dependencies(zlibstatic zlib_ep)
 find_package(Thrift)
 
 if (NOT THRIFT_FOUND)
+  set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install")
+  set(ZLIB_HOME "${ZLIB_PREFIX}")
+  set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include")
+  if (MSVC)
+    if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG")
+      set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib)
+    else()
+      set(ZLIB_STATIC_LIB_NAME zlibstatic.lib)
+    endif()
+  else()
+    set(ZLIB_STATIC_LIB_NAME libz.a)
+  endif()
+  set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}")
+  set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}
+    -DCMAKE_C_FLAGS=${EP_C_FLAGS}
+    -DBUILD_SHARED_LIBS=OFF)
+  ExternalProject_Add(zlib_ep
+    URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz"
+    BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}"
+    ${ZLIB_BUILD_BYPRODUCTS}
+    CMAKE_ARGS ${ZLIB_CMAKE_ARGS})
+
   set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep/src/thrift_ep-install")
   set(THRIFT_HOME "${THRIFT_PREFIX}")
   set(THRIFT_INCLUDE_DIR "${THRIFT_PREFIX}/include")
@@ -341,7 +336,7 @@ if (NOT ARROW_FOUND)
     -DARROW_BUILD_TESTS=OFF)
 
   if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "")
-    set(ARROW_VERSION "98f7cac6e162d9775d615d07b9867c1ec0030f82")
+    set(ARROW_VERSION "a58893882ac8acd1ac4a5036685cbf09a9a09673")
   else()
     set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}")
   endif()
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index a3a26c93..dd1c9d26 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -26,12 +26,12 @@
 #include <thread>
 #include <vector>
 
+#include "arrow/api.h"
+#include "arrow/util/bit-util.h"
+
 #include "parquet/arrow/schema.h"
-#include "parquet/util/bit-util.h"
 #include "parquet/util/schema-util.h"
 
-#include "arrow/api.h"
-
 using arrow::Array;
 using arrow::BooleanArray;
 using arrow::Column;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index b8cb45cf..1e3f6de1 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -21,14 +21,13 @@
 #include <string>
 #include <vector>
 
-#include "parquet/util/bit-util.h"
-#include "parquet/util/logging.h"
-
-#include "parquet/arrow/schema.h"
-
 #include "arrow/api.h"
+#include "arrow/util/bit-util.h"
 #include "arrow/visitor_inline.h"
 
+#include "parquet/arrow/schema.h"
+#include "parquet/util/logging.h"
+
 using arrow::Array;
 using arrow::BinaryArray;
 using arrow::FixedSizeBinaryArray;
diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc
index f63f6f18..ce6936dd 100644
--- a/src/parquet/column_reader.cc
+++ b/src/parquet/column_reader.cc
@@ -21,10 +21,11 @@
 #include <cstdint>
 #include <memory>
 
+#include "arrow/util/rle-encoding.h"
+
 #include "parquet/column_page.h"
 #include "parquet/encoding-internal.h"
 #include "parquet/properties.h"
-#include "parquet/util/rle-encoding.h"
 
 using arrow::MemoryPool;
 
@@ -45,7 +46,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
       num_bytes = *reinterpret_cast<const int32_t*>(data);
       const uint8_t* decoder_data = data + sizeof(int32_t);
       if (!rle_decoder_) {
-        rle_decoder_.reset(new RleDecoder(decoder_data, num_bytes, bit_width_));
+        rle_decoder_.reset(new ::arrow::RleDecoder(decoder_data, num_bytes, bit_width_));
       } else {
         rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
       }
@@ -55,7 +56,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
       num_bytes =
           static_cast<int32_t>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
       if (!bit_packed_decoder_) {
-        bit_packed_decoder_.reset(new BitReader(data, num_bytes));
+        bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes));
       } else {
         bit_packed_decoder_->Reset(data, num_bytes);
       }
diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h
index f4b8b02b..e733d67a 100644
--- a/src/parquet/column_reader.h
+++ b/src/parquet/column_reader.h
@@ -36,11 +36,15 @@
 #include "parquet/util/memory.h"
 #include "parquet/util/visibility.h"
 
-namespace parquet {
+namespace arrow {
 
 class BitReader;
 class RleDecoder;
 
+}  // namespace arrow
+
+namespace parquet {
+
 class PARQUET_EXPORT LevelDecoder {
  public:
   LevelDecoder();
@@ -58,8 +62,8 @@ class PARQUET_EXPORT LevelDecoder {
   int bit_width_;
   int num_values_remaining_;
   Encoding::type encoding_;
-  std::unique_ptr<RleDecoder> rle_decoder_;
-  std::unique_ptr<BitReader> bit_packed_decoder_;
+  std::unique_ptr<::arrow::RleDecoder> rle_decoder_;
+  std::unique_ptr<::arrow::BitReader> bit_packed_decoder_;
 };
 
 class PARQUET_EXPORT ColumnReader {
diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc
index c13d4a0f..21550da3 100644
--- a/src/parquet/column_writer.cc
+++ b/src/parquet/column_writer.cc
@@ -17,15 +17,20 @@
 
 #include "parquet/column_writer.h"
 
+#include "arrow/util/bit-util.h"
+#include "arrow/util/rle-encoding.h"
+
 #include "parquet/encoding-internal.h"
 #include "parquet/properties.h"
 #include "parquet/statistics.h"
 #include "parquet/util/logging.h"
 #include "parquet/util/memory.h"
-#include "parquet/util/rle-encoding.h"
 
 namespace parquet {
 
+using BitWriter = ::arrow::BitWriter;
+using RleEncoder = ::arrow::RleEncoder;
+
 LevelEncoder::LevelEncoder() {}
 LevelEncoder::~LevelEncoder() {}
 
diff --git a/src/parquet/column_writer.h b/src/parquet/column_writer.h
index 4e113de1..1637780e 100644
--- a/src/parquet/column_writer.h
+++ b/src/parquet/column_writer.h
@@ -30,11 +30,15 @@
 #include "parquet/util/memory.h"
 #include "parquet/util/visibility.h"
 
-namespace parquet {
+namespace arrow {
 
 class BitWriter;
 class RleEncoder;
 
+}  // namespace arrow
+
+namespace parquet {
+
 class PARQUET_EXPORT LevelEncoder {
  public:
   LevelEncoder();
@@ -61,8 +65,8 @@ class PARQUET_EXPORT LevelEncoder {
   int bit_width_;
   int rle_length_;
   Encoding::type encoding_;
-  std::unique_ptr<RleEncoder> rle_encoder_;
-  std::unique_ptr<BitWriter> bit_packed_encoder_;
+  std::unique_ptr<::arrow::RleEncoder> rle_encoder_;
+  std::unique_ptr<::arrow::BitWriter> bit_packed_encoder_;
 };
 
 static constexpr int WRITE_BATCH_SIZE = 1000;
diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h
index 61b8e242..88d781f9 100644
--- a/src/parquet/encoding-internal.h
+++ b/src/parquet/encoding-internal.h
@@ -24,21 +24,23 @@
 #include <memory>
 #include <vector>
 
-#include <arrow/util/bit-util.h>
+#include "arrow/util/bit-stream-utils.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/cpu-info.h"
+#include "arrow/util/hash-util.h"
+#include "arrow/util/rle-encoding.h"
 
 #include "parquet/encoding.h"
 #include "parquet/exception.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
-#include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/bit-util.h"
-#include "parquet/util/cpu-info.h"
-#include "parquet/util/hash-util.h"
 #include "parquet/util/memory.h"
-#include "parquet/util/rle-encoding.h"
 
 namespace parquet {
 
+namespace BitUtil = ::arrow::BitUtil;
+using HashUtil = ::arrow::HashUtil;
+
 class ColumnDescriptor;
 
 // ----------------------------------------------------------------------
@@ -136,7 +138,7 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
 
   virtual void SetData(int num_values, const uint8_t* data, int len) {
     num_values_ = num_values;
-    bit_reader_ = BitReader(data, len);
+    bit_reader_ = ::arrow::BitReader(data, len);
   }
 
   // Two flavors of bool decoding
@@ -161,7 +163,7 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
   }
 
  private:
-  BitReader bit_reader_;
+  ::arrow::BitReader bit_reader_;
 };
 
 // ----------------------------------------------------------------------
@@ -196,7 +198,7 @@ class PlainEncoder<BooleanType> : public Encoder<BooleanType> {
         bits_available_(kInMemoryDefaultCapacity * 8),
         bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
         values_sink_(new InMemoryOutputStream(pool)) {
-    bit_writer_.reset(new BitWriter(
+    bit_writer_.reset(new ::arrow::BitWriter(
         bits_buffer_->mutable_data(), static_cast<int>(bits_buffer_->size())));
   }
 
@@ -260,7 +262,7 @@ class PlainEncoder<BooleanType> : public Encoder<BooleanType> {
 
  protected:
   int bits_available_;
-  std::unique_ptr<BitWriter> bit_writer_;
+  std::unique_ptr<::arrow::BitWriter> bit_writer_;
   std::shared_ptr<PoolBuffer> bits_buffer_;
   std::unique_ptr<InMemoryOutputStream> values_sink_;
 };
@@ -325,12 +327,13 @@ class DictionaryDecoder : public Decoder<Type> {
     uint8_t bit_width = *data;
     ++data;
     --len;
-    idx_decoder_ = RleDecoder(data, len, bit_width);
+    idx_decoder_ = ::arrow::RleDecoder(data, len, bit_width);
   }
 
   int Decode(T* buffer, int max_values) override {
     max_values = std::min(max_values, num_values_);
-    int decoded_values = idx_decoder_.GetBatchWithDict(dictionary_, buffer, max_values);
+    int decoded_values =
+        idx_decoder_.GetBatchWithDict(dictionary_.data(), buffer, max_values);
     if (decoded_values != max_values) { ParquetException::EofException(); }
     num_values_ -= max_values;
     return max_values;
@@ -338,8 +341,8 @@ class DictionaryDecoder : public Decoder<Type> {
 
   int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
       int64_t valid_bits_offset) override {
-    int decoded_values = idx_decoder_.GetBatchWithDictSpaced(
-        dictionary_, buffer, num_values, null_count, valid_bits, valid_bits_offset);
+    int decoded_values = idx_decoder_.GetBatchWithDictSpaced(dictionary_.data(), buffer,
+        num_values, null_count, valid_bits, valid_bits_offset);
     if (decoded_values != num_values) { ParquetException::EofException(); }
     return decoded_values;
   }
@@ -354,7 +357,7 @@ class DictionaryDecoder : public Decoder<Type> {
   // pointers).
   std::shared_ptr<PoolBuffer> byte_array_data_;
 
-  RleDecoder idx_decoder_;
+  ::arrow::RleDecoder idx_decoder_;
 };
 
 template <typename Type>
@@ -446,7 +449,7 @@ class DictEncoder : public Encoder<DType> {
         dict_encoded_size_(0),
         type_length_(desc->type_length()) {
     hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY);
-    if (!CpuInfo::initialized()) { CpuInfo::Init(); }
+    if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); }
   }
 
   virtual ~DictEncoder() { DCHECK(buffered_indices_.empty()); }
@@ -464,9 +467,9 @@ class DictEncoder : public Encoder<DType> {
     // reserve
     // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used
     // but not reserving them would cause the encoder to fail.
-    return 1 + RleEncoder::MaxBufferSize(
+    return 1 + ::arrow::RleEncoder::MaxBufferSize(
                    bit_width(), static_cast<int>(buffered_indices_.size())) +
-           RleEncoder::MinBufferSize(bit_width());
+           ::arrow::RleEncoder::MinBufferSize(bit_width());
   }
 
   /// The minimum bit width required to encode the currently buffered indices.
@@ -727,7 +730,7 @@ inline int DictEncoder<DType>::WriteIndices(uint8_t* buffer, int buffer_len) {
   ++buffer;
   --buffer_len;
 
-  RleEncoder encoder(buffer, buffer_len, bit_width());
+  ::arrow::RleEncoder encoder(buffer, buffer_len, bit_width());
   for (int index : buffered_indices_) {
     if (!encoder.Put(index)) return -1;
   }
@@ -756,7 +759,7 @@ class DeltaBitPackDecoder : public Decoder<DType> {
 
   virtual void SetData(int num_values, const uint8_t* data, int len) {
     num_values_ = num_values;
-    decoder_ = BitReader(data, len);
+    decoder_ = ::arrow::BitReader(data, len);
     values_current_block_ = 0;
     values_current_mini_block_ = 0;
   }
@@ -819,7 +822,7 @@ class DeltaBitPackDecoder : public Decoder<DType> {
     return max_values;
   }
 
-  BitReader decoder_;
+  ::arrow::BitReader decoder_;
   int32_t values_current_block_;
   int32_t num_mini_blocks_;
   uint64_t values_per_mini_block_;
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 2e780365..dcd813d3 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -22,10 +22,11 @@
 #include <string>
 #include <vector>
 
+#include "arrow/util/bit-util.h"
+
 #include "parquet/encoding-internal.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
-#include "parquet/util/bit-util.h"
 #include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"
 
@@ -64,7 +65,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) {
   ASSERT_EQ(nvalues, values_decoded);
 
   for (int i = 0; i < nvalues; ++i) {
-    ASSERT_EQ(draws[i], BitUtil::GetArrayBit(decode_data, i)) << i;
+    ASSERT_EQ(draws[i], BitUtil::GetBit(decode_data, i)) << i;
   }
 }
 
diff --git a/src/parquet/encoding.h b/src/parquet/encoding.h
index 7c51cfd1..1417e984 100644
--- a/src/parquet/encoding.h
+++ b/src/parquet/encoding.h
@@ -21,12 +21,11 @@
 #include <cstdint>
 #include <memory>
 
-#include <arrow/util/bit-util.h>
+#include "arrow/util/bit-util.h"
 
 #include "parquet/exception.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
-#include "parquet/util/bit-util.h"
 #include "parquet/util/memory.h"
 
 namespace parquet {
diff --git a/src/parquet/types.h b/src/parquet/types.h
index 7ec38250..38015c4d 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -24,7 +24,8 @@
 #include <sstream>
 #include <string>
 
-#include "parquet/util/compiler-util.h"
+#include "arrow/util/compiler-util.h"
+
 #include "parquet/util/visibility.h"
 
 namespace parquet {
diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt
index e8fbdc74..47172667 100644
--- a/src/parquet/util/CMakeLists.txt
+++ b/src/parquet/util/CMakeLists.txt
@@ -17,19 +17,11 @@
 
 # Headers: util
 install(FILES
-  bit-stream-utils.h
-  bit-stream-utils.inline.h
-  bit-util.h
   buffer-builder.h
-  compiler-util.h
-  cpu-info.h
-  hash-util.h
   logging.h
   macros.h
   memory.h
-  rle-encoding.h
   stopwatch.h
-  sse-util.h
   visibility.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util")
 
@@ -52,7 +44,5 @@ if (PARQUET_BUILD_BENCHMARKS)
   endif()
 endif()
 
-ADD_PARQUET_TEST(bit-util-test)
 ADD_PARQUET_TEST(comparison-test)
 ADD_PARQUET_TEST(memory-test)
-ADD_PARQUET_TEST(rle-test)
diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h
deleted file mode 100644
index 497a9606..00000000
--- a/src/parquet/util/bit-stream-utils.h
+++ /dev/null
@@ -1,170 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H
-#define PARQUET_UTIL_BIT_STREAM_UTILS_H
-
-#include <algorithm>
-#include <cstdint>
-#include <string.h>
-
-#include "parquet/util/bit-util.h"
-#include "parquet/util/compiler-util.h"
-#include "parquet/util/logging.h"
-
-namespace parquet {
-
-/// Utility class to write bit/byte streams.  This class can write data to either be
-/// bit packed or byte aligned (and a single stream that has a mix of both).
-/// This class does not allocate memory.
-class BitWriter {
- public:
-  /// buffer: buffer to write bits to.  Buffer should be preallocated with
-  /// 'buffer_len' bytes.
-  BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
-    Clear();
-  }
-
-  void Clear() {
-    buffered_values_ = 0;
-    byte_offset_ = 0;
-    bit_offset_ = 0;
-  }
-
-  /// The number of current bytes written, including the current byte (i.e. may include a
-  /// fraction of a byte). Includes buffered values.
-  int bytes_written() const {
-    return byte_offset_ + static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
-  }
-  uint8_t* buffer() const { return buffer_; }
-  int buffer_len() const { return max_bytes_; }
-
-  /// Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
-  /// packed.  Returns false if there was not enough space. num_bits must be <= 32.
-  bool PutValue(uint64_t v, int num_bits);
-
-  /// Writes v to the next aligned byte using num_bytes. If T is larger than
-  /// num_bytes, the extra high-order bytes will be ignored. Returns false if
-  /// there was not enough space.
-  template <typename T>
-  bool PutAligned(T v, int num_bytes);
-
-  /// Write a Vlq encoded int to the buffer.  Returns false if there was not enough
-  /// room.  The value is written byte aligned.
-  /// For more details on vlq:
-  /// en.wikipedia.org/wiki/Variable-length_quantity
-  bool PutVlqInt(uint32_t v);
-
-  // Writes an int zigzag encoded.
-  bool PutZigZagVlqInt(int32_t v);
-
-  /// Get a pointer to the next aligned byte and advance the underlying buffer
-  /// by num_bytes.
-  /// Returns NULL if there was not enough space.
-  uint8_t* GetNextBytePtr(int num_bytes = 1);
-
-  /// Flushes all buffered values to the buffer. Call this when done writing to
-  /// the buffer.  If 'align' is true, buffered_values_ is reset and any future
-  /// writes will be written to the next byte boundary.
-  void Flush(bool align = false);
-
- private:
-  uint8_t* buffer_;
-  int max_bytes_;
-
-  /// Bit-packed values are initially written to this variable before being memcpy'd to
-  /// buffer_. This is faster than writing values byte by byte directly to buffer_.
-  uint64_t buffered_values_;
-
-  int byte_offset_;  // Offset in buffer_
-  int bit_offset_;   // Offset in buffered_values_
-};
-
-/// Utility class to read bit/byte stream.  This class can read bits or bytes
-/// that are either byte aligned or not.  It also has utilities to read multiple
-/// bytes in one read (e.g. encoded int).
-class BitReader {
- public:
-  /// 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
-  BitReader(const uint8_t* buffer, int buffer_len)
-      : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
-    int num_bytes = std::min(8, max_bytes_ - byte_offset_);
-    memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
-  }
-
-  BitReader() : buffer_(NULL), max_bytes_(0) {}
-
-  void Reset(const uint8_t* buffer, int buffer_len) {
-    buffer_ = buffer;
-    max_bytes_ = buffer_len;
-    byte_offset_ = 0;
-    bit_offset_ = 0;
-    int num_bytes = std::min(8, max_bytes_ - byte_offset_);
-    memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
-  }
-
-  /// Gets the next value from the buffer.  Returns true if 'v' could be read or false if
-  /// there are not enough bytes left. num_bits must be <= 32.
-  template <typename T>
-  bool GetValue(int num_bits, T* v);
-
-  /// Get a number of values from the buffer. Return the number of values actually read.
-  template <typename T>
-  int GetBatch(int num_bits, T* v, int batch_size);
-
-  /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
-  /// needs to be a little-endian native type and big enough to store
-  /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
-  /// be advanced to the start of the next byte before 'v' is read. Returns
-  /// false if there are not enough bytes left.
-  template <typename T>
-  bool GetAligned(int num_bytes, T* v);
-
-  /// Reads a vlq encoded int from the stream.  The encoded int must start at
-  /// the beginning of a byte. Return false if there were not enough bytes in
-  /// the buffer.
-  bool GetVlqInt(int32_t* v);
-
-  // Reads a zigzag encoded int `into` v.
-  bool GetZigZagVlqInt(int32_t* v);
-
-  /// Returns the number of bytes left in the stream, not including the current
-  /// byte (i.e., there may be an additional fraction of a byte).
-  int bytes_left() {
-    return max_bytes_ - (byte_offset_ + static_cast<int>(BitUtil::Ceil(bit_offset_, 8)));
-  }
-
-  /// Maximum byte length of a vlq encoded int
-  static const int MAX_VLQ_BYTE_LEN = 5;
-
- private:
-  const uint8_t* buffer_;
-  int max_bytes_;
-
-  /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
-  /// faster than reading values byte by byte directly from buffer_.
-  uint64_t buffered_values_;
-
-  int byte_offset_;  // Offset in buffer_
-  int bit_offset_;   // Offset in buffered_values_
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_BIT_STREAM_UTILS_H
diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h
deleted file mode 100644
index 5db16399..00000000
--- a/src/parquet/util/bit-stream-utils.inline.h
+++ /dev/null
@@ -1,258 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
-#define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
-
-#include <algorithm>
-
-#include "parquet/util/bit-stream-utils.h"
-#include "parquet/util/bpacking.h"
-
-namespace parquet {
-
-inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
-  // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
-  DCHECK_LE(num_bits, 32);
-  DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
-
-  if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
-
-  buffered_values_ |= v << bit_offset_;
-  bit_offset_ += num_bits;
-
-  if (UNLIKELY(bit_offset_ >= 64)) {
-    // Flush buffered_values_ and write out bits of v that did not fit
-    memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
-    buffered_values_ = 0;
-    byte_offset_ += 8;
-    bit_offset_ -= 64;
-    buffered_values_ = v >> (num_bits - bit_offset_);
-  }
-  DCHECK_LT(bit_offset_, 64);
-  return true;
-}
-
-inline void BitWriter::Flush(bool align) {
-  int num_bytes = static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
-  DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
-  memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes);
-
-  if (align) {
-    buffered_values_ = 0;
-    byte_offset_ += num_bytes;
-    bit_offset_ = 0;
-  }
-}
-
-inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
-  Flush(/* align */ true);
-  DCHECK_LE(byte_offset_, max_bytes_);
-  if (byte_offset_ + num_bytes > max_bytes_) return NULL;
-  uint8_t* ptr = buffer_ + byte_offset_;
-  byte_offset_ += num_bytes;
-  return ptr;
-}
-
-template <typename T>
-inline bool BitWriter::PutAligned(T val, int num_bytes) {
-  uint8_t* ptr = GetNextBytePtr(num_bytes);
-  if (ptr == NULL) return false;
-  memcpy(ptr, &val, num_bytes);
-  return true;
-}
-
-inline bool BitWriter::PutVlqInt(uint32_t v) {
-  bool result = true;
-  while ((v & 0xFFFFFF80) != 0L) {
-    result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
-    v >>= 7;
-  }
-  result &= PutAligned<uint8_t>(v & 0x7F, 1);
-  return result;
-}
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
-    int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
-  *v = static_cast<T>(
-      BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-  *bit_offset += num_bits;
-  if (*bit_offset >= 64) {
-    *byte_offset += 8;
-    *bit_offset -= 64;
-
-    int bytes_remaining = max_bytes - *byte_offset;
-    if (LIKELY(bytes_remaining >= 8)) {
-      memcpy(buffered_values, buffer + *byte_offset, 8);
-    } else {
-      memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
-    }
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800 4805)
-#endif
-    // Read bits of v that crossed into new buffered_values_
-    *v |= BitUtil::TrailingBits(*buffered_values, *bit_offset)
-          << (num_bits - *bit_offset);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-    DCHECK_LE(*bit_offset, 64);
-  }
-}
-
-template <typename T>
-inline bool BitReader::GetValue(int num_bits, T* v) {
-  return GetBatch(num_bits, v, 1) == 1;
-}
-
-template <typename T>
-inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
-  DCHECK(buffer_ != NULL);
-  // TODO: revisit this limit if necessary
-  DCHECK_LE(num_bits, 32);
-  DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
-
-  int bit_offset = bit_offset_;
-  int byte_offset = byte_offset_;
-  uint64_t buffered_values = buffered_values_;
-  int max_bytes = max_bytes_;
-  const uint8_t* buffer = buffer_;
-
-  uint64_t needed_bits = num_bits * batch_size;
-  uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
-  if (remaining_bits < needed_bits) {
-    batch_size = static_cast<int>(remaining_bits) / num_bits;
-  }
-
-  int i = 0;
-  if (UNLIKELY(bit_offset != 0)) {
-    for (; i < batch_size && bit_offset != 0; ++i) {
-      GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-          &buffered_values);
-    }
-  }
-
-  if (sizeof(T) == 4) {
-    int num_unpacked = unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
-        reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
-    i += num_unpacked;
-    byte_offset += num_unpacked * num_bits / 8;
-  } else {
-    const int buffer_size = 1024;
-    uint32_t unpack_buffer[buffer_size];
-    while (i < batch_size) {
-      int unpack_size = std::min(buffer_size, batch_size - i);
-      int num_unpacked = unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
-          unpack_buffer, unpack_size, num_bits);
-      if (num_unpacked == 0) { break; }
-      for (int k = 0; k < num_unpacked; ++k) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
-        v[i + k] = unpack_buffer[k];
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-      }
-      i += num_unpacked;
-      byte_offset += num_unpacked * num_bits / 8;
-    }
-  }
-
-  int bytes_remaining = max_bytes - byte_offset;
-  if (bytes_remaining >= 8) {
-    memcpy(&buffered_values, buffer + byte_offset, 8);
-  } else {
-    memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
-  }
-
-  for (; i < batch_size; ++i) {
-    GetValue_(
-        num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, &buffered_values);
-  }
-
-  bit_offset_ = bit_offset;
-  byte_offset_ = byte_offset;
-  buffered_values_ = buffered_values;
-
-  return batch_size;
-}
-
-template <typename T>
-inline bool BitReader::GetAligned(int num_bytes, T* v) {
-  DCHECK_LE(num_bytes, static_cast<int>(sizeof(T)));
-  int bytes_read = static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
-  if (UNLIKELY(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false;
-
-  // Advance byte_offset to next unread byte and read num_bytes
-  byte_offset_ += bytes_read;
-  memcpy(v, buffer_ + byte_offset_, num_bytes);
-  byte_offset_ += num_bytes;
-
-  // Reset buffered_values_
-  bit_offset_ = 0;
-  int bytes_remaining = max_bytes_ - byte_offset_;
-  if (LIKELY(bytes_remaining >= 8)) {
-    memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
-  } else {
-    memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
-  }
-  return true;
-}
-
-inline bool BitReader::GetVlqInt(int32_t* v) {
-  *v = 0;
-  int shift = 0;
-  int num_bytes = 0;
-  uint8_t byte = 0;
-  do {
-    if (!GetAligned<uint8_t>(1, &byte)) return false;
-    *v |= (byte & 0x7F) << shift;
-    shift += 7;
-    DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN);
-  } while ((byte & 0x80) != 0);
-  return true;
-}
-
-inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-  uint32_t u = (v << 1) ^ (v >> 31);
-  return PutVlqInt(u);
-}
-
-inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
-  int32_t u_signed;
-  if (!GetVlqInt(&u_signed)) return false;
-  uint32_t u = static_cast<uint32_t>(u_signed);
-  *reinterpret_cast<uint32_t*>(v) = (u >> 1) ^ -(static_cast<int32_t>(u & 1));
-  return true;
-}
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc
deleted file mode 100644
index bc3e182c..00000000
--- a/src/parquet/util/bit-util-test.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#include <gtest/gtest.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <boost/utility.hpp>
-
-#include <iostream>
-
-#include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/bit-util.h"
-#include "parquet/util/cpu-info.h"
-
-namespace parquet {
-
-static void ensure_cpu_info_initialized() {
-  if (!CpuInfo::initialized()) { CpuInfo::Init(); }
-}
-
-TEST(BitUtil, Ceil) {
-  EXPECT_EQ(BitUtil::Ceil(0, 1), 0);
-  EXPECT_EQ(BitUtil::Ceil(1, 1), 1);
-  EXPECT_EQ(BitUtil::Ceil(1, 2), 1);
-  EXPECT_EQ(BitUtil::Ceil(1, 8), 1);
-  EXPECT_EQ(BitUtil::Ceil(7, 8), 1);
-  EXPECT_EQ(BitUtil::Ceil(8, 8), 1);
-  EXPECT_EQ(BitUtil::Ceil(9, 8), 2);
-  EXPECT_EQ(BitUtil::Ceil(9, 9), 1);
-  EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000);
-  EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1);
-  EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10);
-}
-
-TEST(BitUtil, RoundUp) {
-  EXPECT_EQ(BitUtil::RoundUp(0, 1), 0);
-  EXPECT_EQ(BitUtil::RoundUp(1, 1), 1);
-  EXPECT_EQ(BitUtil::RoundUp(1, 2), 2);
-  EXPECT_EQ(BitUtil::RoundUp(6, 2), 6);
-  EXPECT_EQ(BitUtil::RoundUp(7, 3), 9);
-  EXPECT_EQ(BitUtil::RoundUp(9, 9), 9);
-  EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010);
-  EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000);
-  EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000);
-}
-
-TEST(BitUtil, RoundDown) {
-  EXPECT_EQ(BitUtil::RoundDown(0, 1), 0);
-  EXPECT_EQ(BitUtil::RoundDown(1, 1), 1);
-  EXPECT_EQ(BitUtil::RoundDown(1, 2), 0);
-  EXPECT_EQ(BitUtil::RoundDown(6, 2), 6);
-  EXPECT_EQ(BitUtil::RoundDown(7, 3), 6);
-  EXPECT_EQ(BitUtil::RoundDown(9, 9), 9);
-  EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000);
-  EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0);
-  EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000);
-}
-
-TEST(BitUtil, Popcount) {
-  ensure_cpu_info_initialized();
-
-  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
-  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
-  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
-  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
-  EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
-  EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
-  EXPECT_EQ(BitUtil::Popcount(0), 0);
-  EXPECT_EQ(BitUtil::PopcountNoHw(0), 0);
-}
-
-TEST(BitUtil, TrailingBits) {
-  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0);
-  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1);
-  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64),
-      BOOST_BINARY(1 1 1 1 1 1 1 1));
-  EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100),
-      BOOST_BINARY(1 1 1 1 1 1 1 1));
-  EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0);
-  EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0);
-  EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0);
-  EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0);
-  EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63);
-}
-
-TEST(BitUtil, ByteSwap) {
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0)), 0);
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0x11223344)), 0x44332211);
-
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0)), 0);
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0x11223344)), 0x44332211);
-
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint64_t>(0)), 0);
-  EXPECT_EQ(
-      BitUtil::ByteSwap(static_cast<uint64_t>(0x1122334455667788)), 0x8877665544332211);
-
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int64_t>(0)), 0);
-  EXPECT_EQ(
-      BitUtil::ByteSwap(static_cast<int64_t>(0x1122334455667788)), 0x8877665544332211);
-
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0)), 0);
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0x1122)), 0x2211);
-
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0);
-  EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211);
-}
-
-TEST(BitUtil, Log2) {
-  EXPECT_EQ(BitUtil::Log2(1), 0);
-  EXPECT_EQ(BitUtil::Log2(2), 1);
-  EXPECT_EQ(BitUtil::Log2(3), 2);
-  EXPECT_EQ(BitUtil::Log2(4), 2);
-  EXPECT_EQ(BitUtil::Log2(5), 3);
-  EXPECT_EQ(BitUtil::Log2(INT_MAX), 31);
-  EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32);
-  EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64);
-}
-
-TEST(BitUtil, RoundUpToPowerOf2) {
-  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 8), 8);
-  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8);
-  EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16);
-}
-
-TEST(BitUtil, RoundDownToPowerOf2) {
-  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0);
-  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8);
-  EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8);
-}
-
-TEST(BitUtil, RoundUpDown) {
-  EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1);
-  EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1);
-  EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2);
-  EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0);
-  EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1);
-  EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1);
-
-  EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1);
-  EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1);
-  EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2);
-  EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0);
-  EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1);
-  EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1);
-
-  EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1);
-  EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1);
-  EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2);
-  EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0);
-  EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1);
-  EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
-}
-
-void TestZigZag(int32_t v) {
-  uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN];
-  BitWriter writer(buffer, sizeof(buffer));
-  BitReader reader(buffer, sizeof(buffer));
-  writer.PutZigZagVlqInt(v);
-  int32_t result;
-  EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
-  EXPECT_EQ(v, result);
-}
-
-TEST(BitStreamUtil, ZigZag) {
-  TestZigZag(0);
-  TestZigZag(1);
-  TestZigZag(-1);
-  TestZigZag(std::numeric_limits<int32_t>::max());
-  TestZigZag(-std::numeric_limits<int32_t>::max());
-}
-
-}  // namespace parquet
diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h
deleted file mode 100644
index e315b5f4..00000000
--- a/src/parquet/util/bit-util.h
+++ /dev/null
@@ -1,340 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_BIT_UTIL_H
-#define PARQUET_UTIL_BIT_UTIL_H
-
-#if defined(__APPLE__)
-#include <machine/endian.h>
-#elif defined(_WIN32)
-#define __LITTLE_ENDIAN 1
-#else
-#include <endian.h>
-#endif
-
-#if defined(_MSC_VER)
-#define PARQUET_BYTE_SWAP64 _byteswap_uint64
-#define PARQUET_BYTE_SWAP32 _byteswap_ulong
-#else
-#define PARQUET_BYTE_SWAP64 __builtin_bswap64
-#define PARQUET_BYTE_SWAP32 __builtin_bswap32
-#endif
-
-#include <cstdint>
-
-#include "parquet/util/compiler-util.h"
-
-#ifdef PARQUET_USE_SSE
-#include "parquet/util/cpu-info.h"
-#include "parquet/util/sse-util.h"
-#endif
-
-namespace parquet {
-
-#define INIT_BITSET(valid_bits_vector, valid_bits_index)        \
-  int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
-  int bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
-  uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector];
-
-#define READ_NEXT_BITSET(valid_bits_vector)                                          \
-  bit_offset_##valid_bits_vector++;                                                  \
-  if (bit_offset_##valid_bits_vector == 8) {                                         \
-    bit_offset_##valid_bits_vector = 0;                                              \
-    byte_offset_##valid_bits_vector++;                                               \
-    bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \
-  }
-
-// TODO(wesm): The source from Impala was depending on boost::make_unsigned
-//
-// We add a partial stub implementation here
-
-template <typename T>
-struct make_unsigned {};
-
-template <>
-struct make_unsigned<int8_t> {
-  typedef uint8_t type;
-};
-
-template <>
-struct make_unsigned<int16_t> {
-  typedef uint16_t type;
-};
-
-template <>
-struct make_unsigned<int32_t> {
-  typedef uint32_t type;
-};
-
-template <>
-struct make_unsigned<int64_t> {
-  typedef uint64_t type;
-};
-
-/// Utility class to do standard bit tricks
-class BitUtil {
- public:
-  /// Returns the ceil of value/divisor
-  static inline int64_t Ceil(int64_t value, int64_t divisor) {
-    return value / divisor + (value % divisor != 0);
-  }
-
-  /// Returns 'value' rounded up to the nearest multiple of 'factor'
-  static inline int64_t RoundUp(int64_t value, int64_t factor) {
-    return (value + (factor - 1)) / factor * factor;
-  }
-
-  /// Returns 'value' rounded down to the nearest multiple of 'factor'
-  static inline int64_t RoundDown(int64_t value, int64_t factor) {
-    return (value / factor) * factor;
-  }
-
-  /// Returns the smallest power of two that contains v. Taken from
-  /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-  /// TODO: Pick a better name, as it is not clear what happens when the input is
-  /// already a power of two.
-  static inline int64_t NextPowerOfTwo(int64_t v) {
-    --v;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    v |= v >> 32;
-    ++v;
-    return v;
-  }
-
-  /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
-  /// a power of two
-  static inline int RoundUpToPowerOf2(int value, int factor) {
-    // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
-    return (value + (factor - 1)) & ~(factor - 1);
-  }
-
-  static inline int RoundDownToPowerOf2(int value, int factor) {
-    // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
-    return value & ~(factor - 1);
-  }
-
-  /// Specialized round up and down functions for frequently used factors,
-  /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
-  /// Returns the rounded up number of bytes that fit the number of bits.
-  static inline uint32_t RoundUpNumBytes(uint32_t bits) { return (bits + 7) >> 3; }
-
-  /// Returns the rounded down number of bytes that fit the number of bits.
-  static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; }
-
-  /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32.
-  static inline uint32_t RoundUpNumi32(uint32_t bits) { return (bits + 31) >> 5; }
-
-  /// Returns the rounded up 32 multiple.
-  static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; }
-
-  /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64.
-  static inline uint32_t RoundUpNumi64(uint32_t bits) { return (bits + 63) >> 6; }
-
-  /// Returns the rounded down to 64 multiple.
-  static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; }
-
-  /// Non hw accelerated pop count.
-  /// TODO: we don't use this in any perf sensitive code paths currently.  There
-  /// might be a much faster way to implement this.
-  static inline int PopcountNoHw(uint64_t x) {
-    int count = 0;
-    for (; x != 0; ++count)
-      x &= x - 1;
-    return count;
-  }
-
-  /// Returns the number of set bits in x
-  static inline int Popcount(uint64_t x) {
-#ifdef PARQUET_USE_SSE
-    if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) {
-      return POPCNT_popcnt_u64(x);
-    } else {
-      return PopcountNoHw(x);
-    }
-#else
-    return PopcountNoHw(x);
-#endif
-  }
-
-  // Compute correct population count for various-width signed integers
-  template <typename T>
-  static inline int PopcountSigned(T v) {
-    // Converting to same-width unsigned then extending preserves the bit pattern.
-    return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v));
-  }
-
-  /// Returns the 'num_bits' least-significant bits of 'v'.
-  static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
-    if (UNLIKELY(num_bits == 0)) return 0;
-    if (UNLIKELY(num_bits >= 64)) return v;
-    int n = 64 - num_bits;
-    return (v << n) >> n;
-  }
-
-  /// Returns ceil(log2(x)).
-  /// TODO: this could be faster if we use __builtin_clz.  Fix this if this ever shows up
-  /// in a hot path.
-  static inline int Log2(uint64_t x) {
-    // DCHECK_GT(x, 0);
-    if (x == 1) return 0;
-    // Compute result = ceil(log2(x))
-    //                = floor(log2(x - 1)) + 1, for x > 1
-    // by finding the position of the most significant bit (1-indexed) of x - 1
-    // (floor(log2(n)) = MSB(n) (0-indexed))
-    --x;
-    int result = 1;
-    while (x >>= 1)
-      ++result;
-    return result;
-  }
-
-  /// Swaps the byte order (i.e. endianess)
-  static inline int64_t ByteSwap(int64_t value) { return PARQUET_BYTE_SWAP64(value); }
-  static inline uint64_t ByteSwap(uint64_t value) {
-    return static_cast<uint64_t>(PARQUET_BYTE_SWAP64(value));
-  }
-  static inline int32_t ByteSwap(int32_t value) { return PARQUET_BYTE_SWAP32(value); }
-  static inline uint32_t ByteSwap(uint32_t value) {
-    return static_cast<uint32_t>(PARQUET_BYTE_SWAP32(value));
-  }
-  static inline int16_t ByteSwap(int16_t value) {
-    return (((value >> 8) & 0xff) | ((value & 0xff) << 8));
-  }
-  static inline uint16_t ByteSwap(uint16_t value) {
-    return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
-  }
-
-  /// Write the swapped bytes into dst. Src and st cannot overlap.
-  static inline void ByteSwap(void* dst, const void* src, int len) {
-    switch (len) {
-      case 1:
-        *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
-        return;
-      case 2:
-        *reinterpret_cast<int16_t*>(dst) =
-            ByteSwap(*reinterpret_cast<const int16_t*>(src));
-        return;
-      case 4:
-        *reinterpret_cast<int32_t*>(dst) =
-            ByteSwap(*reinterpret_cast<const int32_t*>(src));
-        return;
-      case 8:
-        *reinterpret_cast<int64_t*>(dst) =
-            ByteSwap(*reinterpret_cast<const int64_t*>(src));
-        return;
-      default:
-        break;
-    }
-
-    uint8_t* d = reinterpret_cast<uint8_t*>(dst);
-    const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
-    for (int i = 0; i < len; ++i) {
-      d[i] = s[len - i - 1];
-    }
-  }
-
-/// Converts to big endian format (if not already in big endian) from the
-/// machine's native endian format.
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
-  static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
-  static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
-  static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
-  static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
-  static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
-#else
-  static inline int64_t ToBigEndian(int64_t val) { return val; }
-  static inline uint64_t ToBigEndian(uint64_t val) { return val; }
-  static inline int32_t ToBigEndian(int32_t val) { return val; }
-  static inline uint32_t ToBigEndian(uint32_t val) { return val; }
-  static inline int16_t ToBigEndian(int16_t val) { return val; }
-  static inline uint16_t ToBigEndian(uint16_t val) { return val; }
-#endif
-
-/// Converts from big endian format to the machine's native endian format.
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
-  static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
-  static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); }
-  static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
-  static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); }
-  static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
-#else
-  static inline int64_t FromBigEndian(int64_t val) { return val; }
-  static inline uint64_t FromBigEndian(uint64_t val) { return val; }
-  static inline int32_t FromBigEndian(int32_t val) { return val; }
-  static inline uint32_t FromBigEndian(uint32_t val) { return val; }
-  static inline int16_t FromBigEndian(int16_t val) { return val; }
-  static inline uint16_t FromBigEndian(uint16_t val) { return val; }
-#endif
-
-  // Logical right shift for signed integer types
-  // This is needed because the C >> operator does arithmetic right shift
-  // Negative shift amounts lead to undefined behavior
-  template <typename T>
-  static T ShiftRightLogical(T v, int shift) {
-    // Conversion to unsigned ensures most significant bits always filled with 0's
-    return static_cast<typename make_unsigned<T>::type>(v) >> shift;
-  }
-
-  // Get an specific bit of a numeric type
-  template <typename T>
-  static inline int8_t GetBit(T v, int bitpos) {
-    T masked = v & (static_cast<T>(0x1) << bitpos);
-    return static_cast<int8_t>(ShiftRightLogical(masked, bitpos));
-  }
-
-  // Set a specific bit to 1
-  // Behavior when bitpos is negative is undefined
-  template <typename T>
-  static T SetBit(T v, int bitpos) {
-    return v | (static_cast<T>(0x1) << bitpos);
-  }
-
-  static inline bool GetArrayBit(const uint8_t* bits, int i) {
-    return (bits[i / 8] & (1 << (i % 8))) != 0;
-  }
-
-  static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) {
-    bits[i / 8] |= (1 << (i % 8)) * is_set;
-  }
-
-  // Set a specific bit to 0
-  // Behavior when bitpos is negative is undefined
-  template <typename T>
-  static T UnsetBit(T v, int bitpos) {
-    return v & ~(static_cast<T>(0x1) << bitpos);
-  }
-
-  // Returns the minimum number of bits needed to represent the value of 'x'
-  static inline int NumRequiredBits(uint64_t x) {
-    for (int i = 63; i >= 0; --i) {
-      if (x & (UINT64_C(1) << i)) return i + 1;
-    }
-    return 0;
-  }
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_BIT_UTIL_H
diff --git a/src/parquet/util/bpacking.h b/src/parquet/util/bpacking.h
deleted file mode 100644
index aa6fdab6..00000000
--- a/src/parquet/util/bpacking.h
+++ /dev/null
@@ -1,3342 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This file was modified from its original version for inclusion in parquet-cpp.
-// Original source:
-// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
-// The original copyright notice follows.
-
-/**
-*
-* This code is released under the
-* Apache License Version 2.0 http://www.apache.org/licenses/.
-* (c) Daniel Lemire 2013
-*/
-
-#ifndef PARQUET_UTIL_BPACKING_H
-#define PARQUET_UTIL_BPACKING_H
-
-#include <stdexcept>
-
-namespace parquet {
-
-inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) & 1;
-  out++;
-  *out = ((*in) >> 1) & 1;
-  out++;
-  *out = ((*in) >> 2) & 1;
-  out++;
-  *out = ((*in) >> 3) & 1;
-  out++;
-  *out = ((*in) >> 4) & 1;
-  out++;
-  *out = ((*in) >> 5) & 1;
-  out++;
-  *out = ((*in) >> 6) & 1;
-  out++;
-  *out = ((*in) >> 7) & 1;
-  out++;
-  *out = ((*in) >> 8) & 1;
-  out++;
-  *out = ((*in) >> 9) & 1;
-  out++;
-  *out = ((*in) >> 10) & 1;
-  out++;
-  *out = ((*in) >> 11) & 1;
-  out++;
-  *out = ((*in) >> 12) & 1;
-  out++;
-  *out = ((*in) >> 13) & 1;
-  out++;
-  *out = ((*in) >> 14) & 1;
-  out++;
-  *out = ((*in) >> 15) & 1;
-  out++;
-  *out = ((*in) >> 16) & 1;
-  out++;
-  *out = ((*in) >> 17) & 1;
-  out++;
-  *out = ((*in) >> 18) & 1;
-  out++;
-  *out = ((*in) >> 19) & 1;
-  out++;
-  *out = ((*in) >> 20) & 1;
-  out++;
-  *out = ((*in) >> 21) & 1;
-  out++;
-  *out = ((*in) >> 22) & 1;
-  out++;
-  *out = ((*in) >> 23) & 1;
-  out++;
-  *out = ((*in) >> 24) & 1;
-  out++;
-  *out = ((*in) >> 25) & 1;
-  out++;
-  *out = ((*in) >> 26) & 1;
-  out++;
-  *out = ((*in) >> 27) & 1;
-  out++;
-  *out = ((*in) >> 28) & 1;
-  out++;
-  *out = ((*in) >> 29) & 1;
-  out++;
-  *out = ((*in) >> 30) & 1;
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 2);
-  out++;
-  *out = ((*in) >> 4) % (1U << 2);
-  out++;
-  *out = ((*in) >> 6) % (1U << 2);
-  out++;
-  *out = ((*in) >> 8) % (1U << 2);
-  out++;
-  *out = ((*in) >> 10) % (1U << 2);
-  out++;
-  *out = ((*in) >> 12) % (1U << 2);
-  out++;
-  *out = ((*in) >> 14) % (1U << 2);
-  out++;
-  *out = ((*in) >> 16) % (1U << 2);
-  out++;
-  *out = ((*in) >> 18) % (1U << 2);
-  out++;
-  *out = ((*in) >> 20) % (1U << 2);
-  out++;
-  *out = ((*in) >> 22) % (1U << 2);
-  out++;
-  *out = ((*in) >> 24) % (1U << 2);
-  out++;
-  *out = ((*in) >> 26) % (1U << 2);
-  out++;
-  *out = ((*in) >> 28) % (1U << 2);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 2);
-  out++;
-  *out = ((*in) >> 4) % (1U << 2);
-  out++;
-  *out = ((*in) >> 6) % (1U << 2);
-  out++;
-  *out = ((*in) >> 8) % (1U << 2);
-  out++;
-  *out = ((*in) >> 10) % (1U << 2);
-  out++;
-  *out = ((*in) >> 12) % (1U << 2);
-  out++;
-  *out = ((*in) >> 14) % (1U << 2);
-  out++;
-  *out = ((*in) >> 16) % (1U << 2);
-  out++;
-  *out = ((*in) >> 18) % (1U << 2);
-  out++;
-  *out = ((*in) >> 20) % (1U << 2);
-  out++;
-  *out = ((*in) >> 22) % (1U << 2);
-  out++;
-  *out = ((*in) >> 24) % (1U << 2);
-  out++;
-  *out = ((*in) >> 26) % (1U << 2);
-  out++;
-  *out = ((*in) >> 28) % (1U << 2);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 3);
-  out++;
-  *out = ((*in) >> 6) % (1U << 3);
-  out++;
-  *out = ((*in) >> 9) % (1U << 3);
-  out++;
-  *out = ((*in) >> 12) % (1U << 3);
-  out++;
-  *out = ((*in) >> 15) % (1U << 3);
-  out++;
-  *out = ((*in) >> 18) % (1U << 3);
-  out++;
-  *out = ((*in) >> 21) % (1U << 3);
-  out++;
-  *out = ((*in) >> 24) % (1U << 3);
-  out++;
-  *out = ((*in) >> 27) % (1U << 3);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (3 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 3);
-  out++;
-  *out = ((*in) >> 4) % (1U << 3);
-  out++;
-  *out = ((*in) >> 7) % (1U << 3);
-  out++;
-  *out = ((*in) >> 10) % (1U << 3);
-  out++;
-  *out = ((*in) >> 13) % (1U << 3);
-  out++;
-  *out = ((*in) >> 16) % (1U << 3);
-  out++;
-  *out = ((*in) >> 19) % (1U << 3);
-  out++;
-  *out = ((*in) >> 22) % (1U << 3);
-  out++;
-  *out = ((*in) >> 25) % (1U << 3);
-  out++;
-  *out = ((*in) >> 28) % (1U << 3);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (3 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 3);
-  out++;
-  *out = ((*in) >> 5) % (1U << 3);
-  out++;
-  *out = ((*in) >> 8) % (1U << 3);
-  out++;
-  *out = ((*in) >> 11) % (1U << 3);
-  out++;
-  *out = ((*in) >> 14) % (1U << 3);
-  out++;
-  *out = ((*in) >> 17) % (1U << 3);
-  out++;
-  *out = ((*in) >> 20) % (1U << 3);
-  out++;
-  *out = ((*in) >> 23) % (1U << 3);
-  out++;
-  *out = ((*in) >> 26) % (1U << 3);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 4);
-  out++;
-  *out = ((*in) >> 8) % (1U << 4);
-  out++;
-  *out = ((*in) >> 12) % (1U << 4);
-  out++;
-  *out = ((*in) >> 16) % (1U << 4);
-  out++;
-  *out = ((*in) >> 20) % (1U << 4);
-  out++;
-  *out = ((*in) >> 24) % (1U << 4);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 4);
-  out++;
-  *out = ((*in) >> 8) % (1U << 4);
-  out++;
-  *out = ((*in) >> 12) % (1U << 4);
-  out++;
-  *out = ((*in) >> 16) % (1U << 4);
-  out++;
-  *out = ((*in) >> 20) % (1U << 4);
-  out++;
-  *out = ((*in) >> 24) % (1U << 4);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 4);
-  out++;
-  *out = ((*in) >> 8) % (1U << 4);
-  out++;
-  *out = ((*in) >> 12) % (1U << 4);
-  out++;
-  *out = ((*in) >> 16) % (1U << 4);
-  out++;
-  *out = ((*in) >> 20) % (1U << 4);
-  out++;
-  *out = ((*in) >> 24) % (1U << 4);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 4);
-  out++;
-  *out = ((*in) >> 8) % (1U << 4);
-  out++;
-  *out = ((*in) >> 12) % (1U << 4);
-  out++;
-  *out = ((*in) >> 16) % (1U << 4);
-  out++;
-  *out = ((*in) >> 20) % (1U << 4);
-  out++;
-  *out = ((*in) >> 24) % (1U << 4);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 5);
-  out++;
-  *out = ((*in) >> 10) % (1U << 5);
-  out++;
-  *out = ((*in) >> 15) % (1U << 5);
-  out++;
-  *out = ((*in) >> 20) % (1U << 5);
-  out++;
-  *out = ((*in) >> 25) % (1U << 5);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (5 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 5);
-  out++;
-  *out = ((*in) >> 8) % (1U << 5);
-  out++;
-  *out = ((*in) >> 13) % (1U << 5);
-  out++;
-  *out = ((*in) >> 18) % (1U << 5);
-  out++;
-  *out = ((*in) >> 23) % (1U << 5);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (5 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 5);
-  out++;
-  *out = ((*in) >> 6) % (1U << 5);
-  out++;
-  *out = ((*in) >> 11) % (1U << 5);
-  out++;
-  *out = ((*in) >> 16) % (1U << 5);
-  out++;
-  *out = ((*in) >> 21) % (1U << 5);
-  out++;
-  *out = ((*in) >> 26) % (1U << 5);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (5 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 5);
-  out++;
-  *out = ((*in) >> 9) % (1U << 5);
-  out++;
-  *out = ((*in) >> 14) % (1U << 5);
-  out++;
-  *out = ((*in) >> 19) % (1U << 5);
-  out++;
-  *out = ((*in) >> 24) % (1U << 5);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (5 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 5);
-  out++;
-  *out = ((*in) >> 7) % (1U << 5);
-  out++;
-  *out = ((*in) >> 12) % (1U << 5);
-  out++;
-  *out = ((*in) >> 17) % (1U << 5);
-  out++;
-  *out = ((*in) >> 22) % (1U << 5);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 6);
-  out++;
-  *out = ((*in) >> 12) % (1U << 6);
-  out++;
-  *out = ((*in) >> 18) % (1U << 6);
-  out++;
-  *out = ((*in) >> 24) % (1U << 6);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (6 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 6);
-  out++;
-  *out = ((*in) >> 10) % (1U << 6);
-  out++;
-  *out = ((*in) >> 16) % (1U << 6);
-  out++;
-  *out = ((*in) >> 22) % (1U << 6);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (6 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 6);
-  out++;
-  *out = ((*in) >> 8) % (1U << 6);
-  out++;
-  *out = ((*in) >> 14) % (1U << 6);
-  out++;
-  *out = ((*in) >> 20) % (1U << 6);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 6);
-  out++;
-  *out = ((*in) >> 12) % (1U << 6);
-  out++;
-  *out = ((*in) >> 18) % (1U << 6);
-  out++;
-  *out = ((*in) >> 24) % (1U << 6);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (6 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 6);
-  out++;
-  *out = ((*in) >> 10) % (1U << 6);
-  out++;
-  *out = ((*in) >> 16) % (1U << 6);
-  out++;
-  *out = ((*in) >> 22) % (1U << 6);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (6 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 6);
-  out++;
-  *out = ((*in) >> 8) % (1U << 6);
-  out++;
-  *out = ((*in) >> 14) % (1U << 6);
-  out++;
-  *out = ((*in) >> 20) % (1U << 6);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 7);
-  out++;
-  *out = ((*in) >> 14) % (1U << 7);
-  out++;
-  *out = ((*in) >> 21) % (1U << 7);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (7 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 7);
-  out++;
-  *out = ((*in) >> 10) % (1U << 7);
-  out++;
-  *out = ((*in) >> 17) % (1U << 7);
-  out++;
-  *out = ((*in) >> 24) % (1U << 7);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (7 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 7);
-  out++;
-  *out = ((*in) >> 13) % (1U << 7);
-  out++;
-  *out = ((*in) >> 20) % (1U << 7);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (7 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 7);
-  out++;
-  *out = ((*in) >> 9) % (1U << 7);
-  out++;
-  *out = ((*in) >> 16) % (1U << 7);
-  out++;
-  *out = ((*in) >> 23) % (1U << 7);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (7 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 7);
-  out++;
-  *out = ((*in) >> 12) % (1U << 7);
-  out++;
-  *out = ((*in) >> 19) % (1U << 7);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (7 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 7);
-  out++;
-  *out = ((*in) >> 8) % (1U << 7);
-  out++;
-  *out = ((*in) >> 15) % (1U << 7);
-  out++;
-  *out = ((*in) >> 22) % (1U << 7);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (7 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 7);
-  out++;
-  *out = ((*in) >> 11) % (1U << 7);
-  out++;
-  *out = ((*in) >> 18) % (1U << 7);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 8);
-  out++;
-  *out = ((*in) >> 16) % (1U << 8);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 9);
-  out++;
-  *out = ((*in) >> 18) % (1U << 9);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (9 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 9);
-  out++;
-  *out = ((*in) >> 13) % (1U << 9);
-  out++;
-  *out = ((*in) >> 22) % (1U << 9);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (9 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 9);
-  out++;
-  *out = ((*in) >> 17) % (1U << 9);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (9 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 9);
-  out++;
-  *out = ((*in) >> 12) % (1U << 9);
-  out++;
-  *out = ((*in) >> 21) % (1U << 9);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (9 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 9);
-  out++;
-  *out = ((*in) >> 16) % (1U << 9);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (9 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 9);
-  out++;
-  *out = ((*in) >> 11) % (1U << 9);
-  out++;
-  *out = ((*in) >> 20) % (1U << 9);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (9 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 9);
-  out++;
-  *out = ((*in) >> 15) % (1U << 9);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (9 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 9);
-  out++;
-  *out = ((*in) >> 10) % (1U << 9);
-  out++;
-  *out = ((*in) >> 19) % (1U << 9);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (9 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 9);
-  out++;
-  *out = ((*in) >> 14) % (1U << 9);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 10);
-  out++;
-  *out = ((*in) >> 20) % (1U << 10);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (10 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 10);
-  out++;
-  *out = ((*in) >> 18) % (1U << 10);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (10 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 10);
-  out++;
-  *out = ((*in) >> 16) % (1U << 10);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (10 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 10);
-  out++;
-  *out = ((*in) >> 14) % (1U << 10);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (10 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 10);
-  out++;
-  *out = ((*in) >> 12) % (1U << 10);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 10);
-  out++;
-  *out = ((*in) >> 20) % (1U << 10);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (10 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 10);
-  out++;
-  *out = ((*in) >> 18) % (1U << 10);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (10 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 10);
-  out++;
-  *out = ((*in) >> 16) % (1U << 10);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (10 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 10);
-  out++;
-  *out = ((*in) >> 14) % (1U << 10);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (10 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 10);
-  out++;
-  *out = ((*in) >> 12) % (1U << 10);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 11);
-  out++;
-  *out = ((*in) >> 11) % (1U << 11);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (11 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 11);
-  out++;
-  *out = ((*in) >> 12) % (1U << 11);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (11 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 11);
-  out++;
-  *out = ((*in) >> 13) % (1U << 11);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (11 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 11);
-  out++;
-  *out = ((*in) >> 14) % (1U << 11);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (11 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 11);
-  out++;
-  *out = ((*in) >> 15) % (1U << 11);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (11 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 11);
-  out++;
-  *out = ((*in) >> 16) % (1U << 11);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (11 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 11);
-  out++;
-  *out = ((*in) >> 17) % (1U << 11);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (11 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 11);
-  out++;
-  *out = ((*in) >> 18) % (1U << 11);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (11 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 11);
-  out++;
-  *out = ((*in) >> 19) % (1U << 11);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (11 - 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 11);
-  out++;
-  *out = ((*in) >> 20) % (1U << 11);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (11 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 11);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 12);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (12 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 12);
-  out++;
-  *out = ((*in) >> 16) % (1U << 12);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (12 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 12);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 12);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (12 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 12);
-  out++;
-  *out = ((*in) >> 16) % (1U << 12);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (12 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 12);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 12);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (12 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 12);
-  out++;
-  *out = ((*in) >> 16) % (1U << 12);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (12 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 12);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 12);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (12 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 12);
-  out++;
-  *out = ((*in) >> 16) % (1U << 12);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (12 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 12);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 13);
-  out++;
-  *out = ((*in) >> 13) % (1U << 13);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (13 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 13);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (13 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 13);
-  out++;
-  *out = ((*in) >> 14) % (1U << 13);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (13 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 13);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (13 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 13);
-  out++;
-  *out = ((*in) >> 15) % (1U << 13);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (13 - 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 13);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (13 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 13);
-  out++;
-  *out = ((*in) >> 16) % (1U << 13);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (13 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 13);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (13 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 13);
-  out++;
-  *out = ((*in) >> 17) % (1U << 13);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (13 - 11);
-  out++;
-  *out = ((*in) >> 11) % (1U << 13);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (13 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 13);
-  out++;
-  *out = ((*in) >> 18) % (1U << 13);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (13 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 13);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (13 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 13);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 14);
-  out++;
-  *out = ((*in) >> 14) % (1U << 14);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (14 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 14);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (14 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 14);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (14 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 14);
-  out++;
-  *out = ((*in) >> 16) % (1U << 14);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (14 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 14);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (14 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 14);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (14 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 14);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 14);
-  out++;
-  *out = ((*in) >> 14) % (1U << 14);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (14 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 14);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (14 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 14);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (14 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 14);
-  out++;
-  *out = ((*in) >> 16) % (1U << 14);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (14 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 14);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (14 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 14);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (14 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 14);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 15);
-  out++;
-  *out = ((*in) >> 15) % (1U << 15);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (15 - 13);
-  out++;
-  *out = ((*in) >> 13) % (1U << 15);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (15 - 11);
-  out++;
-  *out = ((*in) >> 11) % (1U << 15);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (15 - 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 15);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (15 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 15);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (15 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 15);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (15 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 15);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (15 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 15);
-  out++;
-  *out = ((*in) >> 16) % (1U << 15);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (15 - 14);
-  out++;
-  *out = ((*in) >> 14) % (1U << 15);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (15 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 15);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (15 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 15);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (15 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 15);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (15 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 15);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (15 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 15);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (15 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 15);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (17 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 17);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (17 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 17);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (17 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 17);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (17 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 17);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (17 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 17);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (17 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 17);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (17 - 14);
-  out++;
-  *out = ((*in) >> 14) % (1U << 17);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (17 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (17 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 17);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (17 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 17);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (17 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 17);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (17 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 17);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (17 - 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 17);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (17 - 11);
-  out++;
-  *out = ((*in) >> 11) % (1U << 17);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (17 - 13);
-  out++;
-  *out = ((*in) >> 13) % (1U << 17);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (17 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (18 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 18);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (18 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 18);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (18 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 18);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (18 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (18 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 18);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (18 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 18);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (18 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 18);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (18 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (18 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 18);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (18 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 18);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (18 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 18);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (18 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (18 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 18);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (18 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 18);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (18 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 18);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (18 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (19 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 19);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (19 - 12);
-  out++;
-  *out = ((*in) >> 12) % (1U << 19);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (19 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (19 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 19);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (19 - 11);
-  out++;
-  *out = ((*in) >> 11) % (1U << 19);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (19 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (19 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 19);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (19 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 19);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (19 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (19 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 19);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (19 - 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 19);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (19 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (19 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 19);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (19 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 19);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (19 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (19 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 19);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (19 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 19);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (19 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (20 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 20);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (20 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (20 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 20);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (20 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (20 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 20);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (20 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (20 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 20);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (20 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (20 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 20);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (20 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (20 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 20);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (20 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (20 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 20);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (20 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (20 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 20);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (20 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 21);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (21 - 10);
-  out++;
-  *out = ((*in) >> 10) % (1U << 21);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (21 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (21 - 9);
-  out++;
-  *out = ((*in) >> 9) % (1U << 21);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 19)) << (21 - 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (21 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 21);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (21 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (21 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 21);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (21 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (21 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 21);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (21 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (21 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 21);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (21 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (21 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 21);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (21 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (21 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 21);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (21 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (21 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 21);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (21 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (21 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 21);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (21 - 11);
-  out++;
-  *out = ((*in) >> 11);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (22 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (22 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 22);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (22 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (22 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 22);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (22 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (22 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 22);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (22 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (22 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 22);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (22 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (22 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (22 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (22 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 22);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (22 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (22 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 22);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (22 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (22 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 22);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (22 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (22 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 22);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (22 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (22 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 23);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (23 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (23 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 23);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 19)) << (23 - 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (23 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (23 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 23);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (23 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (23 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 23);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (23 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (23 - 11);
-  out++;
-  *out = ((*in) >> 11);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (23 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 23);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (23 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (23 - 7);
-  out++;
-  *out = ((*in) >> 7) % (1U << 23);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 21)) << (23 - 21);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (23 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (23 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 23);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (23 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (23 - 8);
-  out++;
-  *out = ((*in) >> 8) % (1U << 23);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (23 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (23 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (23 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 23);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (23 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (23 - 9);
-  out++;
-  *out = ((*in) >> 9);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (24 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (24 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 25);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (25 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (25 - 11);
-  out++;
-  *out = ((*in) >> 11);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (25 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 25);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (25 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (25 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (25 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (25 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 25);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 19)) << (25 - 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (25 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (25 - 5);
-  out++;
-  *out = ((*in) >> 5) % (1U << 25);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 23)) << (25 - 23);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (25 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (25 - 9);
-  out++;
-  *out = ((*in) >> 9);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (25 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 25);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (25 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (25 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (25 - 6);
-  out++;
-  *out = ((*in) >> 6) % (1U << 25);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (25 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (25 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (25 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (25 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 25);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 21)) << (25 - 21);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (25 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (25 - 7);
-  out++;
-  *out = ((*in) >> 7);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (26 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (26 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (26 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (26 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 26);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (26 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (26 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (26 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (26 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 26);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (26 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (26 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (26 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (26 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (26 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (26 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (26 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (26 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 26);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (26 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (26 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (26 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (26 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 26);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (26 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (26 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (26 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (26 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 27);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (27 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (27 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (27 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (27 - 7);
-  out++;
-  *out = ((*in) >> 7);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (27 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 27);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (27 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 19)) << (27 - 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (27 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (27 - 9);
-  out++;
-  *out = ((*in) >> 9);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (27 - 4);
-  out++;
-  *out = ((*in) >> 4) % (1U << 27);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 26)) << (27 - 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 21)) << (27 - 21);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (27 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (27 - 11);
-  out++;
-  *out = ((*in) >> 11);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (27 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (27 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 27);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 23)) << (27 - 23);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (27 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (27 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (27 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (27 - 3);
-  out++;
-  *out = ((*in) >> 3) % (1U << 27);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 25)) << (27 - 25);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (27 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (27 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (27 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (27 - 5);
-  out++;
-  *out = ((*in) >> 5);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (28 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (28 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (28 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (28 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (28 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (28 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (28 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (28 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (28 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (28 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (28 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (28 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (28 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (28 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (28 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (28 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (28 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (28 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (28 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (28 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (28 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (28 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (28 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (28 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 29);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 26)) << (29 - 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 23)) << (29 - 23);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (29 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (29 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (29 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (29 - 11);
-  out++;
-  *out = ((*in) >> 11);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (29 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (29 - 5);
-  out++;
-  *out = ((*in) >> 5);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (29 - 2);
-  out++;
-  *out = ((*in) >> 2) % (1U << 29);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 28)) << (29 - 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 25)) << (29 - 25);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (29 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 19)) << (29 - 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (29 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (29 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (29 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (29 - 7);
-  out++;
-  *out = ((*in) >> 7);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (29 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (29 - 1);
-  out++;
-  *out = ((*in) >> 1) % (1U << 29);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 27)) << (29 - 27);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (29 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 21)) << (29 - 21);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (29 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (29 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (29 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (29 - 9);
-  out++;
-  *out = ((*in) >> 9);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (29 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (29 - 3);
-  out++;
-  *out = ((*in) >> 3);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 30);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 28)) << (30 - 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 26)) << (30 - 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (30 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (30 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (30 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (30 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (30 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (30 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (30 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (30 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (30 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (30 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (30 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (30 - 2);
-  out++;
-  *out = ((*in) >> 2);
-  ++in;
-  out++;
-  *out = ((*in) >> 0) % (1U << 30);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 28)) << (30 - 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 26)) << (30 - 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (30 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (30 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (30 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (30 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (30 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (30 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (30 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (30 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (30 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (30 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (30 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (30 - 2);
-  out++;
-  *out = ((*in) >> 2);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0) % (1U << 31);
-  out++;
-  *out = ((*in) >> 31);
-  ++in;
-  *out |= ((*in) % (1U << 30)) << (31 - 30);
-  out++;
-  *out = ((*in) >> 30);
-  ++in;
-  *out |= ((*in) % (1U << 29)) << (31 - 29);
-  out++;
-  *out = ((*in) >> 29);
-  ++in;
-  *out |= ((*in) % (1U << 28)) << (31 - 28);
-  out++;
-  *out = ((*in) >> 28);
-  ++in;
-  *out |= ((*in) % (1U << 27)) << (31 - 27);
-  out++;
-  *out = ((*in) >> 27);
-  ++in;
-  *out |= ((*in) % (1U << 26)) << (31 - 26);
-  out++;
-  *out = ((*in) >> 26);
-  ++in;
-  *out |= ((*in) % (1U << 25)) << (31 - 25);
-  out++;
-  *out = ((*in) >> 25);
-  ++in;
-  *out |= ((*in) % (1U << 24)) << (31 - 24);
-  out++;
-  *out = ((*in) >> 24);
-  ++in;
-  *out |= ((*in) % (1U << 23)) << (31 - 23);
-  out++;
-  *out = ((*in) >> 23);
-  ++in;
-  *out |= ((*in) % (1U << 22)) << (31 - 22);
-  out++;
-  *out = ((*in) >> 22);
-  ++in;
-  *out |= ((*in) % (1U << 21)) << (31 - 21);
-  out++;
-  *out = ((*in) >> 21);
-  ++in;
-  *out |= ((*in) % (1U << 20)) << (31 - 20);
-  out++;
-  *out = ((*in) >> 20);
-  ++in;
-  *out |= ((*in) % (1U << 19)) << (31 - 19);
-  out++;
-  *out = ((*in) >> 19);
-  ++in;
-  *out |= ((*in) % (1U << 18)) << (31 - 18);
-  out++;
-  *out = ((*in) >> 18);
-  ++in;
-  *out |= ((*in) % (1U << 17)) << (31 - 17);
-  out++;
-  *out = ((*in) >> 17);
-  ++in;
-  *out |= ((*in) % (1U << 16)) << (31 - 16);
-  out++;
-  *out = ((*in) >> 16);
-  ++in;
-  *out |= ((*in) % (1U << 15)) << (31 - 15);
-  out++;
-  *out = ((*in) >> 15);
-  ++in;
-  *out |= ((*in) % (1U << 14)) << (31 - 14);
-  out++;
-  *out = ((*in) >> 14);
-  ++in;
-  *out |= ((*in) % (1U << 13)) << (31 - 13);
-  out++;
-  *out = ((*in) >> 13);
-  ++in;
-  *out |= ((*in) % (1U << 12)) << (31 - 12);
-  out++;
-  *out = ((*in) >> 12);
-  ++in;
-  *out |= ((*in) % (1U << 11)) << (31 - 11);
-  out++;
-  *out = ((*in) >> 11);
-  ++in;
-  *out |= ((*in) % (1U << 10)) << (31 - 10);
-  out++;
-  *out = ((*in) >> 10);
-  ++in;
-  *out |= ((*in) % (1U << 9)) << (31 - 9);
-  out++;
-  *out = ((*in) >> 9);
-  ++in;
-  *out |= ((*in) % (1U << 8)) << (31 - 8);
-  out++;
-  *out = ((*in) >> 8);
-  ++in;
-  *out |= ((*in) % (1U << 7)) << (31 - 7);
-  out++;
-  *out = ((*in) >> 7);
-  ++in;
-  *out |= ((*in) % (1U << 6)) << (31 - 6);
-  out++;
-  *out = ((*in) >> 6);
-  ++in;
-  *out |= ((*in) % (1U << 5)) << (31 - 5);
-  out++;
-  *out = ((*in) >> 5);
-  ++in;
-  *out |= ((*in) % (1U << 4)) << (31 - 4);
-  out++;
-  *out = ((*in) >> 4);
-  ++in;
-  *out |= ((*in) % (1U << 3)) << (31 - 3);
-  out++;
-  *out = ((*in) >> 3);
-  ++in;
-  *out |= ((*in) % (1U << 2)) << (31 - 2);
-  out++;
-  *out = ((*in) >> 2);
-  ++in;
-  *out |= ((*in) % (1U << 1)) << (31 - 1);
-  out++;
-  *out = ((*in) >> 1);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-  *out = ((*in) >> 0);
-  ++in;
-  out++;
-
-  return in;
-}
-
-inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) {
-  for (int k = 0; k < 32; ++k) {
-    out[k] = 0;
-  }
-  return in;
-}
-
-inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
-  batch_size = batch_size / 32 * 32;
-  int num_loops = batch_size / 32;
-
-  switch (num_bits) {
-    case 0:
-      for (int i = 0; i < num_loops; ++i)
-        in = nullunpacker32(in, out + i * 32);
-      break;
-    case 1:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack1_32(in, out + i * 32);
-      break;
-    case 2:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack2_32(in, out + i * 32);
-      break;
-    case 3:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack3_32(in, out + i * 32);
-      break;
-    case 4:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack4_32(in, out + i * 32);
-      break;
-    case 5:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack5_32(in, out + i * 32);
-      break;
-    case 6:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack6_32(in, out + i * 32);
-      break;
-    case 7:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack7_32(in, out + i * 32);
-      break;
-    case 8:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack8_32(in, out + i * 32);
-      break;
-    case 9:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack9_32(in, out + i * 32);
-      break;
-    case 10:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack10_32(in, out + i * 32);
-      break;
-    case 11:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack11_32(in, out + i * 32);
-      break;
-    case 12:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack12_32(in, out + i * 32);
-      break;
-    case 13:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack13_32(in, out + i * 32);
-      break;
-    case 14:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack14_32(in, out + i * 32);
-      break;
-    case 15:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack15_32(in, out + i * 32);
-      break;
-    case 16:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack16_32(in, out + i * 32);
-      break;
-    case 17:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack17_32(in, out + i * 32);
-      break;
-    case 18:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack18_32(in, out + i * 32);
-      break;
-    case 19:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack19_32(in, out + i * 32);
-      break;
-    case 20:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack20_32(in, out + i * 32);
-      break;
-    case 21:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack21_32(in, out + i * 32);
-      break;
-    case 22:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack22_32(in, out + i * 32);
-      break;
-    case 23:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack23_32(in, out + i * 32);
-      break;
-    case 24:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack24_32(in, out + i * 32);
-      break;
-    case 25:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack25_32(in, out + i * 32);
-      break;
-    case 26:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack26_32(in, out + i * 32);
-      break;
-    case 27:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack27_32(in, out + i * 32);
-      break;
-    case 28:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack28_32(in, out + i * 32);
-      break;
-    case 29:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack29_32(in, out + i * 32);
-      break;
-    case 30:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack30_32(in, out + i * 32);
-      break;
-    case 31:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack31_32(in, out + i * 32);
-      break;
-    case 32:
-      for (int i = 0; i < num_loops; ++i)
-        in = unpack32_32(in, out + i * 32);
-      break;
-    default:
-      throw std::runtime_error("Unsupported num_bits");
-  }
-
-  return batch_size;
-}
-
-};  // namespace parquet
-
-#endif  // PARQUET_UTIL_BPACKING_H
diff --git a/src/parquet/util/compiler-util.h b/src/parquet/util/compiler-util.h
deleted file mode 100644
index 9d0b2650..00000000
--- a/src/parquet/util/compiler-util.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PARQUET_UTIL_COMPILER_UTIL_H
-#define PARQUET_UTIL_COMPILER_UTIL_H
-
-// Compiler hint that this branch is likely or unlikely to
-// be taken. Take from the "What all programmers should know
-// about memory" paper.
-// example: if (LIKELY(size > 0)) { ... }
-// example: if (UNLIKELY(!status.ok())) { ... }
-#ifdef LIKELY
-#undef LIKELY
-#endif
-
-#ifdef UNLIKELY
-#undef UNLIKELY
-#endif
-
-#ifdef _MSC_VER
-#define LIKELY(expr) expr
-#define UNLIKELY(expr) expr
-#else
-#define LIKELY(expr) __builtin_expect(!!(expr), 1)
-#define UNLIKELY(expr) __builtin_expect(!!(expr), 0)
-#endif
-
-#define PREFETCH(addr) __builtin_prefetch(addr)
-
-// macros to disable padding
-// these macros are portable across different compilers and platforms
-//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
-#if defined(_MSC_VER)
-#define MANUALLY_ALIGNED_STRUCT(alignment) \
-  __pragma(pack(1));                       \
-  struct __declspec(align(alignment))
-#define STRUCT_END(name, size) \
-  __pragma(pack());            \
-  static_assert(sizeof(name) == size, "compiler breaks packing rules")
-#elif defined(__GNUC__) || defined(__clang__)
-#define MANUALLY_ALIGNED_STRUCT(alignment) \
-  _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
-#define STRUCT_END(name, size) \
-  _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules")
-#else
-#error Unknown compiler, please define structure alignment macros
-#endif
-
-#endif  // PARQUET_UTIL_COMPILER_UTIL_H
diff --git a/src/parquet/util/cpu-info.cc b/src/parquet/util/cpu-info.cc
deleted file mode 100644
index c6c143ea..00000000
--- a/src/parquet/util/cpu-info.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
-// functions needed for parquet-cpp
-
-#include "parquet/util/cpu-info.h"
-
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-#endif
-
-#include <stdlib.h>
-#include <string.h>
-
-#ifndef _MSC_VER
-#include <unistd.h>
-#endif
-
-#include <boost/algorithm/string.hpp>
-
-#include <algorithm>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <mutex>
-#include <sstream>
-#include <string>
-
-#include "parquet/exception.h"
-#include "parquet/util/logging.h"
-
-using boost::algorithm::contains;
-using boost::algorithm::trim;
-using std::max;
-using std::string;
-
-namespace parquet {
-
-bool CpuInfo::initialized_ = false;
-int64_t CpuInfo::hardware_flags_ = 0;
-int64_t CpuInfo::original_hardware_flags_;
-int64_t CpuInfo::cache_sizes_[L3_CACHE + 1];
-int64_t CpuInfo::cycles_per_ms_;
-int CpuInfo::num_cores_ = 1;
-string CpuInfo::model_name_ = "unknown";  // NOLINT
-static std::mutex cpuinfo_mutex;
-
-static struct {
-  string name;
-  int64_t flag;
-} flag_mappings[] = {
-    {"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1}, {"sse4_2", CpuInfo::SSE4_2},
-    {"popcnt", CpuInfo::POPCNT},
-};
-static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
-
-// Helper function to parse for hardware flags.
-// values contains a list of space-seperated flags.  check to see if the flags we
-// care about are present.
-// Returns a bitmap of flags.
-int64_t ParseCPUFlags(const string& values) {
-  int64_t flags = 0;
-  for (int i = 0; i < num_flags; ++i) {
-    if (contains(values, flag_mappings[i].name)) { flags |= flag_mappings[i].flag; }
-  }
-  return flags;
-}
-
-void CpuInfo::Init() {
-  std::lock_guard<std::mutex> cpuinfo_lock(cpuinfo_mutex);
-
-  if (initialized()) { return; }
-
-  string line;
-  string name;
-  string value;
-
-  float max_mhz = 0;
-  int num_cores = 0;
-
-  memset(&cache_sizes_, 0, sizeof(cache_sizes_));
-
-  // Read from /proc/cpuinfo
-  std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
-  while (cpuinfo) {
-    getline(cpuinfo, line);
-    size_t colon = line.find(':');
-    if (colon != string::npos) {
-      name = line.substr(0, colon - 1);
-      value = line.substr(colon + 1, string::npos);
-      trim(name);
-      trim(value);
-      if (name.compare("flags") == 0) {
-        hardware_flags_ |= ParseCPUFlags(value);
-      } else if (name.compare("cpu MHz") == 0) {
-        // Every core will report a different speed.  We'll take the max, assuming
-        // that when impala is running, the core will not be in a lower power state.
-        // TODO: is there a more robust way to do this, such as
-        // Window's QueryPerformanceFrequency()
-        float mhz = static_cast<float>(atof(value.c_str()));
-        max_mhz = max(mhz, max_mhz);
-      } else if (name.compare("processor") == 0) {
-        ++num_cores;
-      } else if (name.compare("model name") == 0) {
-        model_name_ = value;
-      }
-    }
-  }
-  if (cpuinfo.is_open()) cpuinfo.close();
-
-#ifdef __APPLE__
-  // On Mac OS X use sysctl() to get the cache sizes
-  size_t len = 0;
-  sysctlbyname("hw.cachesize", NULL, &len, NULL, 0);
-  uint64_t* data = static_cast<uint64_t*>(malloc(len));
-  sysctlbyname("hw.cachesize", data, &len, NULL, 0);
-  DCHECK(len / sizeof(uint64_t) >= 3);
-  for (size_t i = 0; i < 3; ++i) {
-    cache_sizes_[i] = data[i];
-  }
-#else
-#ifndef _SC_LEVEL1_DCACHE_SIZE
-  // Provide reasonable default values if no info
-  cache_sizes_[0] = 32 * 1024;    // Level 1: 32k
-  cache_sizes_[1] = 256 * 1024;   // Level 2: 256k
-  cache_sizes_[2] = 3072 * 1024;  // Level 3: 3M
-#else
-  // Call sysconf to query for the cache sizes
-  cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
-  cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
-  cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
-#endif
-#endif
-
-  if (max_mhz != 0) {
-    cycles_per_ms_ = static_cast<int64_t>(max_mhz) * 1000;
-  } else {
-    cycles_per_ms_ = 1000000;
-  }
-  original_hardware_flags_ = hardware_flags_;
-
-  if (num_cores > 0) {
-    num_cores_ = num_cores;
-  } else {
-    num_cores_ = 1;
-  }
-
-  initialized_ = true;
-}
-
-void CpuInfo::VerifyCpuRequirements() {
-  if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) {
-    throw ParquetException("CPU does not support the Supplemental SSE3 instruction set");
-  }
-}
-
-void CpuInfo::EnableFeature(int64_t flag, bool enable) {
-  DCHECK(initialized_);
-  if (!enable) {
-    hardware_flags_ &= ~flag;
-  } else {
-    // Can't turn something on that can't be supported
-    DCHECK((original_hardware_flags_ & flag) != 0);
-    hardware_flags_ |= flag;
-  }
-}
-
-int64_t CpuInfo::hardware_flags() {
-  DCHECK(initialized_);
-  return hardware_flags_;
-}
-
-int64_t CpuInfo::CacheSize(CacheLevel level) {
-  DCHECK(initialized_);
-  return cache_sizes_[level];
-}
-
-int64_t CpuInfo::cycles_per_ms() {
-  DCHECK(initialized_);
-  return cycles_per_ms_;
-}
-
-int CpuInfo::num_cores() {
-  DCHECK(initialized_);
-  return num_cores_;
-}
-
-std::string CpuInfo::model_name() {
-  DCHECK(initialized_);
-  return model_name_;
-}
-
-}  // namespace parquet
diff --git a/src/parquet/util/cpu-info.h b/src/parquet/util/cpu-info.h
deleted file mode 100644
index 4d7cd273..00000000
--- a/src/parquet/util/cpu-info.h
+++ /dev/null
@@ -1,92 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
-// functions needed for parquet-cpp
-
-#ifndef PARQUET_UTIL_CPU_INFO_H
-#define PARQUET_UTIL_CPU_INFO_H
-
-#include <cstdint>
-#include <string>
-
-#include "parquet/util/visibility.h"
-
-namespace parquet {
-
-/// CpuInfo is an interface to query for cpu information at runtime.  The caller can
-/// ask for the sizes of the caches and what hardware features are supported.
-/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
-/// /sys/devices)
-class PARQUET_EXPORT CpuInfo {
- public:
-  static const int64_t SSSE3 = (1 << 1);
-  static const int64_t SSE4_1 = (1 << 2);
-  static const int64_t SSE4_2 = (1 << 3);
-  static const int64_t POPCNT = (1 << 4);
-
-  /// Cache enums for L1 (data), L2 and L3
-  enum CacheLevel {
-    L1_CACHE = 0,
-    L2_CACHE = 1,
-    L3_CACHE = 2,
-  };
-
-  /// Initialize CpuInfo.
-  static void Init();
-
-  /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
-  /// and terminate.
-  static void VerifyCpuRequirements();
-
-  /// Returns all the flags for this cpu
-  static int64_t hardware_flags();
-
-  /// Returns whether of not the cpu supports this flag
-  inline static bool IsSupported(int64_t flag) { return (hardware_flags_ & flag) != 0; }
-
-  /// Toggle a hardware feature on and off.  It is not valid to turn on a feature
-  /// that the underlying hardware cannot support. This is useful for testing.
-  static void EnableFeature(int64_t flag, bool enable);
-
-  /// Returns the size of the cache in KB at this cache level
-  static int64_t CacheSize(CacheLevel level);
-
-  /// Returns the number of cpu cycles per millisecond
-  static int64_t cycles_per_ms();
-
-  /// Returns the number of cores (including hyper-threaded) on this machine.
-  static int num_cores();
-
-  /// Returns the model name of the cpu (e.g. Intel i7-2600)
-  static std::string model_name();
-
-  static bool initialized() { return initialized_; }
-
- private:
-  static bool initialized_;
-  static int64_t hardware_flags_;
-  static int64_t original_hardware_flags_;
-  static int64_t cache_sizes_[L3_CACHE + 1];
-  static int64_t cycles_per_ms_;
-  static int num_cores_;
-  static std::string model_name_;  // NOLINT
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_CPU_INFO_H
diff --git a/src/parquet/util/hash-util.h b/src/parquet/util/hash-util.h
deleted file mode 100644
index 28a523d7..00000000
--- a/src/parquet/util/hash-util.h
+++ /dev/null
@@ -1,258 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-02-22
-
-#ifndef PARQUET_UTIL_HASH_UTIL_H
-#define PARQUET_UTIL_HASH_UTIL_H
-
-#include <cstdint>
-
-#include "parquet/util/compiler-util.h"
-#include "parquet/util/cpu-info.h"
-#include "parquet/util/logging.h"
-#include "parquet/util/sse-util.h"
-
-namespace parquet {
-
-/// Utility class to compute hash values.
-class HashUtil {
- public:
-  /// Compute the Crc32 hash for data using SSE4 instructions.  The input hash
-  /// parameter is the current hash/seed value.
-  /// This should only be called if SSE is supported.
-  /// This is ~4x faster than Fnv/Boost Hash.
-  /// TODO: crc32 hashes with different seeds do not result in different hash functions.
-  /// The resulting hashes are correlated.
-  /// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
-  static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    uint32_t words = bytes / sizeof(uint32_t);
-    bytes = bytes % sizeof(uint32_t);
-
-    const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
-    while (words--) {
-      hash = SSE4_crc32_u32(hash, *p);
-      ++p;
-    }
-
-    const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
-    while (bytes--) {
-      hash = SSE4_crc32_u8(hash, *s);
-      ++s;
-    }
-
-    // The lower half of the CRC hash has has poor uniformity, so swap the halves
-    // for anyone who only uses the first several bits of the hash.
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  /// CrcHash() specialized for 1-byte data
-  static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
-    hash = SSE4_crc32_u8(hash, *s);
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  /// CrcHash() specialized for 2-byte data
-  static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
-    hash = SSE4_crc32_u16(hash, *s);
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  /// CrcHash() specialized for 4-byte data
-  static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
-    hash = SSE4_crc32_u32(hash, *p);
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  /// CrcHash() specialized for 8-byte data
-  static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
-    hash = SSE4_crc32_u64(hash, *p);
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  /// CrcHash() specialized for 12-byte data
-  static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
-    hash = SSE4_crc32_u64(hash, *p);
-    ++p;
-    hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  /// CrcHash() specialized for 16-byte data
-  static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
-    hash = SSE4_crc32_u64(hash, *p);
-    ++p;
-    hash = SSE4_crc32_u64(hash, *p);
-    hash = (hash << 16) | (hash >> 16);
-    return hash;
-  }
-
-  static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995;
-  static const int MURMUR_R = 47;
-
-  /// Murmur2 hash implementation returning 64-bit hashes.
-  static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
-    uint64_t h = seed ^ (len * MURMUR_PRIME);
-
-    const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
-    const uint64_t* end = data + (len / sizeof(uint64_t));
-
-    while (data != end) {
-      uint64_t k = *data++;
-      k *= MURMUR_PRIME;
-      k ^= k >> MURMUR_R;
-      k *= MURMUR_PRIME;
-      h ^= k;
-      h *= MURMUR_PRIME;
-    }
-
-    const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
-    switch (len & 7) {
-      case 7:
-        h ^= uint64_t(data2[6]) << 48;
-      case 6:
-        h ^= uint64_t(data2[5]) << 40;
-      case 5:
-        h ^= uint64_t(data2[4]) << 32;
-      case 4:
-        h ^= uint64_t(data2[3]) << 24;
-      case 3:
-        h ^= uint64_t(data2[2]) << 16;
-      case 2:
-        h ^= uint64_t(data2[1]) << 8;
-      case 1:
-        h ^= uint64_t(data2[0]);
-        h *= MURMUR_PRIME;
-    }
-
-    h ^= h >> MURMUR_R;
-    h *= MURMUR_PRIME;
-    h ^= h >> MURMUR_R;
-    return h;
-  }
-
-  /// default values recommended by http://isthe.com/chongo/tech/comp/fnv/
-  static const uint32_t FNV_PRIME = 0x01000193;  //   16777619
-  static const uint32_t FNV_SEED = 0x811C9DC5;   // 2166136261
-  static const uint64_t FNV64_PRIME = 1099511628211UL;
-  static const uint64_t FNV64_SEED = 14695981039346656037UL;
-
-  /// Implementation of the Fowler-Noll-Vo hash function. This is not as performant
-  /// as boost's hash on int types (2x slower) but has bit entropy.
-  /// For ints, boost just returns the value of the int which can be pathological.
-  /// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
-  /// is taken on the hash, all values will collide to the same bucket.
-  /// For string values, Fnv is slightly faster than boost.
-  /// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit,
-  /// which can lead to poor results when input bytes are duplicated.
-  /// See FnvHash64to32() for how this can be mitigated.
-  static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) {
-    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
-    while (bytes--) {
-      hash = (*ptr ^ hash) * FNV64_PRIME;
-      ++ptr;
-    }
-    return hash;
-  }
-
-  /// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits.
-  /// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the
-  /// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs.
-  /// The input seed 'hash' is duplicated so the top half of the seed is not all zero.
-  /// Data length must be at least 1 byte: zero-length data should be handled separately,
-  /// for example using CombineHash with a unique constant value to avoid returning the
-  /// hash argument. Zero-length data gives terrible results: the initial hash value is
-  /// xored with itself cancelling all bits.
-  static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) {
-    // IMPALA-2270: this function should never be used for zero-byte inputs.
-    DCHECK_GT(bytes, 0);
-    uint64_t hash_u64 = hash | ((uint64_t)hash << 32);
-    hash_u64 = FnvHash64(data, bytes, hash_u64);
-    return (hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF);
-  }
-
-  /// Computes the hash value for data.  Will call either CrcHash or MurmurHash
-  /// depending on hardware capabilities.
-  /// Seed values for different steps of the query execution should use different seeds
-  /// to prevent accidental key collisions. (See IMPALA-219 for more details).
-  static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) {
-#ifdef PARQUET_USE_SSE
-    if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) {
-      return CrcHash(data, bytes, seed);
-    } else {
-      return MurmurHash2_64(data, bytes, seed);
-    }
-#else
-    return static_cast<uint32_t>(MurmurHash2_64(data, bytes, seed));
-#endif
-  }
-
-  /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
-  static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
-
-  /// Combine hashes 'value' and 'seed' to get a new hash value.  Similar to
-  /// boost::hash_combine(), but for uint32_t. This function should be used with a
-  /// constant first argument to update the hash value for zero-length values such as
-  /// NULL, boolean, and empty strings.
-  static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
-    return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
-  }
-
-  // Get 32 more bits of randomness from a 32-bit hash:
-  static inline uint32_t Rehash32to32(const uint32_t hash) {
-    // Constants generated by uuidgen(1) with the -r flag
-    static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull;
-    // This is strongly universal hashing following Dietzfelbinger's "Universal hashing
-    // and k-wise independent random variables via integer arithmetic without primes". As
-    // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the
-    // randomness of the constants) that any subset of bit positions of
-    // Rehash32to32(hash1) is equal to the same subset of bit positions
-    // Rehash32to32(hash2) is minimal.
-    return (static_cast<uint64_t>(hash) * m + a) >> 32;
-  }
-
-  static inline uint64_t Rehash32to64(const uint32_t hash) {
-    static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull,
-                          a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull;
-    const uint64_t hash1 = (static_cast<uint64_t>(hash) * m1 + a1) >> 32;
-    const uint64_t hash2 = (static_cast<uint64_t>(hash) * m2 + a2) >> 32;
-    return hash1 | (hash2 << 32);
-  }
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_HASH_UTIL_H
diff --git a/src/parquet/util/logging.h b/src/parquet/util/logging.h
index 7ec5a8f5..e2c7abb1 100644
--- a/src/parquet/util/logging.h
+++ b/src/parquet/util/logging.h
@@ -18,109 +18,6 @@
 #ifndef PARQUET_UTIL_LOGGING_H
 #define PARQUET_UTIL_LOGGING_H
 
-#include <iostream>
-
-namespace parquet {
-
-// Stubbed versions of macros defined in glog/logging.h, intended for
-// environments where glog headers aren't available.
-//
-// Add more as needed.
-
-// Log levels. LOG ignores them, so their values are abitrary.
-
-#define PARQUET_INFO 0
-#define PARQUET_WARNING 1
-#define PARQUET_ERROR 2
-#define PARQUET_FATAL 3
-
-#define PARQUET_LOG_INTERNAL(level) ::parquet::internal::CerrLog(level)
-#define PARQUET_LOG(level) PARQUET_LOG_INTERNAL(PARQUET_##level)
-#define PARQUET_IGNORE_EXPR(expr) ((void)(expr));
-
-#define PARQUET_CHECK(condition) \
-  (condition) ? 0 : PARQUET_LOG(FATAL) << "Check failed: " #condition " "
-
-#ifdef NDEBUG
-#define PARQUET_DFATAL PARQUET_WARNING
-
-#define DCHECK(condition)        \
-  PARQUET_IGNORE_EXPR(condition) \
-  while (false)                  \
-  ::parquet::internal::NullLog()
-#define DCHECK_EQ(val1, val2) \
-  PARQUET_IGNORE_EXPR(val1)   \
-  while (false)               \
-  ::parquet::internal::NullLog()
-#define DCHECK_NE(val1, val2) \
-  PARQUET_IGNORE_EXPR(val1)   \
-  while (false)               \
-  ::parquet::internal::NullLog()
-#define DCHECK_LE(val1, val2) \
-  PARQUET_IGNORE_EXPR(val1)   \
-  while (false)               \
-  ::parquet::internal::NullLog()
-#define DCHECK_LT(val1, val2) \
-  PARQUET_IGNORE_EXPR(val1)   \
-  while (false)               \
-  ::parquet::internal::NullLog()
-#define DCHECK_GE(val1, val2) \
-  PARQUET_IGNORE_EXPR(val1)   \
-  while (false)               \
-  ::parquet::internal::NullLog()
-#define DCHECK_GT(val1, val2) \
-  PARQUET_IGNORE_EXPR(val1)   \
-  while (false)               \
-  ::parquet::internal::NullLog()
-
-#else
-#define PARQUET_DFATAL PARQUET_FATAL
-
-#define DCHECK(condition) PARQUET_CHECK(condition)
-#define DCHECK_EQ(val1, val2) PARQUET_CHECK((val1) == (val2))
-#define DCHECK_NE(val1, val2) PARQUET_CHECK((val1) != (val2))
-#define DCHECK_LE(val1, val2) PARQUET_CHECK((val1) <= (val2))
-#define DCHECK_LT(val1, val2) PARQUET_CHECK((val1) < (val2))
-#define DCHECK_GE(val1, val2) PARQUET_CHECK((val1) >= (val2))
-#define DCHECK_GT(val1, val2) PARQUET_CHECK((val1) > (val2))
-
-#endif  // NDEBUG
-
-namespace internal {
-
-class NullLog {
- public:
-  template <class T>
-  NullLog& operator<<(const T& t) {
-    return *this;
-  }
-};
-
-class CerrLog {
- public:
-  CerrLog(int severity)  // NOLINT(runtime/explicit)
-      : severity_(severity),
-        has_logged_(false) {}
-
-  ~CerrLog() {
-    if (has_logged_) { std::cerr << std::endl; }
-    if (severity_ == PARQUET_FATAL) { exit(1); }
-  }
-
-  template <class T>
-  CerrLog& operator<<(const T& t) {
-    has_logged_ = true;
-    std::cerr << t;
-    return *this;
-  }
-
- private:
-  const int severity_;
-  bool has_logged_;
-};
-
-}  // namespace internal
-
-}  // namespace parquet
+#include "arrow/util/logging.h"
 
 #endif  // PARQUET_UTIL_LOGGING_H
diff --git a/src/parquet/util/memory.cc b/src/parquet/util/memory.cc
index 96131821..39c43fb3 100644
--- a/src/parquet/util/memory.cc
+++ b/src/parquet/util/memory.cc
@@ -24,10 +24,10 @@
 #include <utility>
 
 #include "arrow/status.h"
+#include "arrow/util/bit-util.h"
 
 #include "parquet/exception.h"
 #include "parquet/types.h"
-#include "parquet/util/bit-util.h"
 #include "parquet/util/logging.h"
 
 using arrow::MemoryPool;
@@ -123,7 +123,7 @@ template <bool CHECK_LIMIT_FIRST>
 uint8_t* ChunkedAllocator::Allocate(int size) {
   if (size == 0) return NULL;
 
-  int64_t num_bytes = BitUtil::RoundUp(size, 8);
+  int64_t num_bytes = ::arrow::BitUtil::RoundUp(size, 8);
   if (current_chunk_idx_ == -1 ||
       num_bytes + chunks_[current_chunk_idx_].allocated_bytes >
           chunks_[current_chunk_idx_].size) {
diff --git a/src/parquet/util/memory.h b/src/parquet/util/memory.h
index 2e7eb0f7..d73a3de7 100644
--- a/src/parquet/util/memory.h
+++ b/src/parquet/util/memory.h
@@ -80,6 +80,8 @@ class PARQUET_EXPORT Vector {
   void Swap(Vector<T>& v);
   inline T& operator[](int64_t i) const { return data_[i]; }
 
+  const T* data() const { return data_; }
+
  private:
   std::unique_ptr<PoolBuffer> buffer_;
   int64_t size_;
diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h
deleted file mode 100644
index d9eb6a29..00000000
--- a/src/parquet/util/rle-encoding.h
+++ /dev/null
@@ -1,599 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_RLE_ENCODING_H
-#define PARQUET_UTIL_RLE_ENCODING_H
-
-#include <algorithm>
-#include <math.h>
-
-#include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/bit-util.h"
-#include "parquet/util/compiler-util.h"
-#include "parquet/util/memory.h"
-
-namespace parquet {
-
-/// Utility classes to do run length encoding (RLE) for fixed bit width values.  If runs
-/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
-/// (literal encoding).
-/// For both types of runs, there is a byte-aligned indicator which encodes the length
-/// of the run and the type of the run.
-/// This encoding has the benefit that when there aren't any long enough runs, values
-/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
-/// the run length are byte aligned. This allows for very efficient decoding
-/// implementations.
-/// The encoding is:
-///    encoded-block := run*
-///    run := literal-run | repeated-run
-///    literal-run := literal-indicator < literal bytes >
-///    repeated-run := repeated-indicator < repeated value. padded to byte boundary >
-///    literal-indicator := varint_encode( number_of_groups << 1 | 1)
-///    repeated-indicator := varint_encode( number_of_repetitions << 1 )
-//
-/// Each run is preceded by a varint. The varint's least significant bit is
-/// used to indicate whether the run is a literal run or a repeated run. The rest
-/// of the varint is used to determine the length of the run (eg how many times the
-/// value repeats).
-//
-/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
-/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
-/// on a byte boundary without padding.
-/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
-/// the actual number of encoded ints. (This means that the total number of encoded values
-/// can not be determined from the encoded data, since the number of values in the last
-/// group may not be a multiple of 8). For the last group of literal runs, we pad
-/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
-/// without the need for additional checks.
-//
-/// There is a break-even point when it is more storage efficient to do run length
-/// encoding.  For 1 bit-width values, that point is 8 values.  They require 2 bytes
-/// for both the repeated encoding or the literal encoding.  This value can always
-/// be computed based on the bit-width.
-/// TODO: think about how to use this for strings.  The bit packing isn't quite the same.
-//
-/// Examples with bit-width 1 (eg encoding booleans):
-/// ----------------------------------------
-/// 100 1s followed by 100 0s:
-/// <varint(100 << 1)> <1, padded to 1 byte>  <varint(100 << 1)> <0, padded to 1 byte>
-///  - (total 4 bytes)
-//
-/// alternating 1s and 0s (200 total):
-/// 200 ints = 25 groups of 8
-/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
-/// (total 26 bytes, 1 byte overhead)
-//
-
-/// Decoder class for RLE encoded data.
-class RleDecoder {
- public:
-  /// Create a decoder object. buffer/buffer_len is the decoded data.
-  /// bit_width is the width of each value (before encoding).
-  RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
-      : bit_reader_(buffer, buffer_len),
-        bit_width_(bit_width),
-        current_value_(0),
-        repeat_count_(0),
-        literal_count_(0) {
-    DCHECK_GE(bit_width_, 0);
-    DCHECK_LE(bit_width_, 64);
-  }
-
-  RleDecoder() : bit_width_(-1) {}
-
-  void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
-    DCHECK_GE(bit_width, 0);
-    DCHECK_LE(bit_width, 64);
-    bit_reader_.Reset(buffer, buffer_len);
-    bit_width_ = bit_width;
-    current_value_ = 0;
-    repeat_count_ = 0;
-    literal_count_ = 0;
-  }
-
-  /// Gets the next value.  Returns false if there are no more.
-  template <typename T>
-  bool Get(T* val);
-
-  /// Gets a batch of values.  Returns the number of decoded elements.
-  template <typename T>
-  int GetBatch(T* values, int batch_size);
-
-  /// Like GetBatch but the values are then decoded using the provided dictionary
-  template <typename T>
-  int GetBatchWithDict(const Vector<T>& dictionary, T* values, int batch_size);
-
-  /// Like GetBatchWithDict but add spacing for null entries
-  template <typename T>
-  int GetBatchWithDictSpaced(const Vector<T>& dictionary, T* values, int batch_size,
-      int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset);
-
- protected:
-  BitReader bit_reader_;
-  /// Number of bits needed to encode the value. Must be between 0 and 64.
-  int bit_width_;
-  uint64_t current_value_;
-  uint32_t repeat_count_;
-  uint32_t literal_count_;
-
- private:
-  /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
-  /// are no more.
-  template <typename T>
-  bool NextCounts();
-};
-
-/// Class to incrementally build the rle data.   This class does not allocate any memory.
-/// The encoding has two modes: encoding repeated runs and literal runs.
-/// If the run is sufficiently short, it is more efficient to encode as a literal run.
-/// This class does so by buffering 8 values at a time.  If they are not all the same
-/// they are added to the literal run.  If they are the same, they are added to the
-/// repeated run.  When we switch modes, the previous run is flushed out.
-class RleEncoder {
- public:
-  /// buffer/buffer_len: preallocated output buffer.
-  /// bit_width: max number of bits for value.
-  /// TODO: consider adding a min_repeated_run_length so the caller can control
-  /// when values should be encoded as repeated runs.  Currently this is derived
-  /// based on the bit_width, which can determine a storage optimal choice.
-  /// TODO: allow 0 bit_width (and have dict encoder use it)
-  RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
-      : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
-    DCHECK_GE(bit_width_, 0);
-    DCHECK_LE(bit_width_, 64);
-    max_run_byte_size_ = MinBufferSize(bit_width);
-    DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
-    Clear();
-  }
-
-  /// Returns the minimum buffer size needed to use the encoder for 'bit_width'
-  /// This is the maximum length of a single run for 'bit_width'.
-  /// It is not valid to pass a buffer less than this length.
-  static int MinBufferSize(int bit_width) {
-    /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
-    int max_literal_run_size =
-        1 + static_cast<int>(BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8));
-    /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value.
-    int max_repeated_run_size =
-        BitReader::MAX_VLQ_BYTE_LEN + static_cast<int>(BitUtil::Ceil(bit_width, 8));
-    return std::max(max_literal_run_size, max_repeated_run_size);
-  }
-
-  /// Returns the maximum byte size it could take to encode 'num_values'.
-  static int MaxBufferSize(int bit_width, int num_values) {
-    // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
-    // and then a repeated run of length 8".
-    // 8 values per smallest run, 8 bits per byte
-    // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8);
-    int bytes_per_run = bit_width;
-    int num_runs = static_cast<int>(BitUtil::Ceil(num_values, 8));
-    int literal_max_size = num_runs + num_runs * bytes_per_run;
-
-    // In the very worst case scenario, the data is a concatenation of repeated
-    // runs of 8 values. Repeated run has a 1 byte varint followed by the
-    // bit-packed repeated value
-    int min_repeated_run_size = 1 + static_cast<int>(BitUtil::Ceil(bit_width, 8));
-    int repeated_max_size =
-        static_cast<int>(BitUtil::Ceil(num_values, 8)) * min_repeated_run_size;
-
-    return std::max(literal_max_size, repeated_max_size);
-  }
-
-  /// Encode value.  Returns true if the value fits in buffer, false otherwise.
-  /// This value must be representable with bit_width_ bits.
-  bool Put(uint64_t value);
-
-  /// Flushes any pending values to the underlying buffer.
-  /// Returns the total number of bytes written
-  int Flush();
-
-  /// Resets all the state in the encoder.
-  void Clear();
-
-  /// Returns pointer to underlying buffer
-  uint8_t* buffer() { return bit_writer_.buffer(); }
-  int32_t len() { return bit_writer_.bytes_written(); }
-
- private:
-  /// Flushes any buffered values.  If this is part of a repeated run, this is largely
-  /// a no-op.
-  /// If it is part of a literal run, this will call FlushLiteralRun, which writes
-  /// out the buffered literal values.
-  /// If 'done' is true, the current run would be written even if it would normally
-  /// have been buffered more.  This should only be called at the end, when the
-  /// encoder has received all values even if it would normally continue to be
-  /// buffered.
-  void FlushBufferedValues(bool done);
-
-  /// Flushes literal values to the underlying buffer.  If update_indicator_byte,
-  /// then the current literal run is complete and the indicator byte is updated.
-  void FlushLiteralRun(bool update_indicator_byte);
-
-  /// Flushes a repeated run to the underlying buffer.
-  void FlushRepeatedRun();
-
-  /// Checks and sets buffer_full_. This must be called after flushing a run to
-  /// make sure there are enough bytes remaining to encode the next run.
-  void CheckBufferFull();
-
-  /// The maximum number of values in a single literal run
-  /// (number of groups encodable by a 1-byte indicator * 8)
-  static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
-
-  /// Number of bits needed to encode the value. Must be between 0 and 64.
-  const int bit_width_;
-
-  /// Underlying buffer.
-  BitWriter bit_writer_;
-
-  /// If true, the buffer is full and subsequent Put()'s will fail.
-  bool buffer_full_;
-
-  /// The maximum byte size a single run can take.
-  int max_run_byte_size_;
-
-  /// We need to buffer at most 8 values for literals.  This happens when the
-  /// bit_width is 1 (so 8 values fit in one byte).
-  /// TODO: generalize this to other bit widths
-  int64_t buffered_values_[8];
-
-  /// Number of values in buffered_values_
-  int num_buffered_values_;
-
-  /// The current (also last) value that was written and the count of how
-  /// many times in a row that value has been seen.  This is maintained even
-  /// if we are in a literal run.  If the repeat_count_ get high enough, we switch
-  /// to encoding repeated runs.
-  uint64_t current_value_;
-  int repeat_count_;
-
-  /// Number of literals in the current run.  This does not include the literals
-  /// that might be in buffered_values_.  Only after we've got a group big enough
-  /// can we decide if they should part of the literal_count_ or repeat_count_
-  int literal_count_;
-
-  /// Pointer to a byte in the underlying buffer that stores the indicator byte.
-  /// This is reserved as soon as we need a literal run but the value is written
-  /// when the literal run is complete.
-  uint8_t* literal_indicator_byte_;
-};
-
-template <typename T>
-inline bool RleDecoder::Get(T* val) {
-  return GetBatch(val, 1) == 1;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatch(T* values, int batch_size) {
-  DCHECK_GE(bit_width_, 0);
-  int values_read = 0;
-
-  while (values_read < batch_size) {
-    if (repeat_count_ > 0) {
-      int repeat_batch =
-          std::min(batch_size - values_read, static_cast<int>(repeat_count_));
-      std::fill(values + values_read, values + values_read + repeat_batch,
-          static_cast<T>(current_value_));
-      repeat_count_ -= repeat_batch;
-      values_read += repeat_batch;
-    } else if (literal_count_ > 0) {
-      int literal_batch =
-          std::min(batch_size - values_read, static_cast<int>(literal_count_));
-      int actual_read =
-          bit_reader_.GetBatch(bit_width_, values + values_read, literal_batch);
-      DCHECK_EQ(actual_read, literal_batch);
-      literal_count_ -= literal_batch;
-      values_read += literal_batch;
-    } else {
-      if (!NextCounts<T>()) return values_read;
-    }
-  }
-
-  return values_read;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatchWithDict(
-    const Vector<T>& dictionary, T* values, int batch_size) {
-  DCHECK_GE(bit_width_, 0);
-  int values_read = 0;
-
-  while (values_read < batch_size) {
-    if (repeat_count_ > 0) {
-      int repeat_batch =
-          std::min(batch_size - values_read, static_cast<int>(repeat_count_));
-      std::fill(values + values_read, values + values_read + repeat_batch,
-          dictionary[current_value_]);
-      repeat_count_ -= repeat_batch;
-      values_read += repeat_batch;
-    } else if (literal_count_ > 0) {
-      int literal_batch =
-          std::min(batch_size - values_read, static_cast<int>(literal_count_));
-
-      const int buffer_size = 1024;
-      int indices[buffer_size];
-      literal_batch = std::min(literal_batch, buffer_size);
-      int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch);
-      DCHECK_EQ(actual_read, literal_batch);
-      for (int i = 0; i < literal_batch; ++i) {
-        values[values_read + i] = dictionary[indices[i]];
-      }
-      literal_count_ -= literal_batch;
-      values_read += literal_batch;
-    } else {
-      if (!NextCounts<T>()) return values_read;
-    }
-  }
-
-  return values_read;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatchWithDictSpaced(const Vector<T>& dictionary, T* values,
-    int batch_size, int null_count, const uint8_t* valid_bits,
-    int64_t valid_bits_offset) {
-  DCHECK_GE(bit_width_, 0);
-  int values_read = 0;
-  int remaining_nulls = null_count;
-  INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
-
-  while (values_read < batch_size) {
-    bool is_valid = (bitset_valid_bits & (1 << bit_offset_valid_bits)) != 0;
-    READ_NEXT_BITSET(valid_bits);
-
-    if (is_valid) {
-      if ((repeat_count_ == 0) && (literal_count_ == 0)) {
-        if (!NextCounts<T>()) return values_read;
-      }
-      if (repeat_count_ > 0) {
-        T value = dictionary[current_value_];
-        // The current index is already valid, we don't need to check that again
-        int repeat_batch = 1;
-        repeat_count_--;
-
-        while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
-          if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
-            repeat_count_--;
-          } else {
-            remaining_nulls--;
-          }
-          repeat_batch++;
-
-          READ_NEXT_BITSET(valid_bits);
-        }
-        std::fill(values + values_read, values + values_read + repeat_batch, value);
-        values_read += repeat_batch;
-      } else if (literal_count_ > 0) {
-        int literal_batch = std::min(
-            batch_size - values_read - remaining_nulls, static_cast<int>(literal_count_));
-
-        // Decode the literals
-        constexpr int kBufferSize = 1024;
-        int indices[kBufferSize];
-        literal_batch = std::min(literal_batch, kBufferSize);
-        int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch);
-        DCHECK_EQ(actual_read, literal_batch);
-
-        int skipped = 0;
-        int literals_read = 1;
-        values[values_read] = dictionary[indices[0]];
-
-        // Read the first bitset to the end
-        while (literals_read < literal_batch) {
-          if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
-            values[values_read + literals_read + skipped] =
-                dictionary[indices[literals_read]];
-            literals_read++;
-          } else {
-            skipped++;
-          }
-
-          READ_NEXT_BITSET(valid_bits);
-        }
-        literal_count_ -= literal_batch;
-        values_read += literal_batch + skipped;
-        remaining_nulls -= skipped;
-      }
-    } else {
-      values_read++;
-      remaining_nulls--;
-    }
-  }
-
-  return values_read;
-}
-
-template <typename T>
-bool RleDecoder::NextCounts() {
-  // Read the next run's indicator int, it could be a literal or repeated run.
-  // The int is encoded as a vlq-encoded value.
-  int32_t indicator_value = 0;
-  bool result = bit_reader_.GetVlqInt(&indicator_value);
-  if (!result) return false;
-
-  // lsb indicates if it is a literal run or repeated run
-  bool is_literal = indicator_value & 1;
-  if (is_literal) {
-    literal_count_ = (indicator_value >> 1) * 8;
-  } else {
-    repeat_count_ = indicator_value >> 1;
-    bool result =
-        bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::Ceil(bit_width_, 8)),
-            reinterpret_cast<T*>(&current_value_));
-    DCHECK(result);
-  }
-  return true;
-}
-
-/// This function buffers input values 8 at a time.  After seeing all 8 values,
-/// it decides whether they should be encoded as a literal or repeated run.
-inline bool RleEncoder::Put(uint64_t value) {
-  DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
-  if (UNLIKELY(buffer_full_)) return false;
-
-  if (LIKELY(current_value_ == value)) {
-    ++repeat_count_;
-    if (repeat_count_ > 8) {
-      // This is just a continuation of the current run, no need to buffer the
-      // values.
-      // Note that this is the fast path for long repeated runs.
-      return true;
-    }
-  } else {
-    if (repeat_count_ >= 8) {
-      // We had a run that was long enough but it has ended.  Flush the
-      // current repeated run.
-      DCHECK_EQ(literal_count_, 0);
-      FlushRepeatedRun();
-    }
-    repeat_count_ = 1;
-    current_value_ = value;
-  }
-
-  buffered_values_[num_buffered_values_] = value;
-  if (++num_buffered_values_ == 8) {
-    DCHECK_EQ(literal_count_ % 8, 0);
-    FlushBufferedValues(false);
-  }
-  return true;
-}
-
-inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
-  if (literal_indicator_byte_ == NULL) {
-    // The literal indicator byte has not been reserved yet, get one now.
-    literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
-    DCHECK(literal_indicator_byte_ != NULL);
-  }
-
-  // Write all the buffered values as bit packed literals
-  for (int i = 0; i < num_buffered_values_; ++i) {
-    bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
-    DCHECK(success) << "There is a bug in using CheckBufferFull()";
-  }
-  num_buffered_values_ = 0;
-
-  if (update_indicator_byte) {
-    // At this point we need to write the indicator byte for the literal run.
-    // We only reserve one byte, to allow for streaming writes of literal values.
-    // The logic makes sure we flush literal runs often enough to not overrun
-    // the 1 byte.
-    DCHECK_EQ(literal_count_ % 8, 0);
-    int num_groups = literal_count_ / 8;
-    int32_t indicator_value = (num_groups << 1) | 1;
-    DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
-    *literal_indicator_byte_ = indicator_value;
-    literal_indicator_byte_ = NULL;
-    literal_count_ = 0;
-    CheckBufferFull();
-  }
-}
-
-inline void RleEncoder::FlushRepeatedRun() {
-  DCHECK_GT(repeat_count_, 0);
-  bool result = true;
-  // The lsb of 0 indicates this is a repeated run
-  int32_t indicator_value = repeat_count_ << 1 | 0;
-  result &= bit_writer_.PutVlqInt(indicator_value);
-  result &= bit_writer_.PutAligned(
-      current_value_, static_cast<int>(BitUtil::Ceil(bit_width_, 8)));
-  DCHECK(result);
-  num_buffered_values_ = 0;
-  repeat_count_ = 0;
-  CheckBufferFull();
-}
-
-/// Flush the values that have been buffered.  At this point we decide whether
-/// we need to switch between the run types or continue the current one.
-inline void RleEncoder::FlushBufferedValues(bool done) {
-  if (repeat_count_ >= 8) {
-    // Clear the buffered values.  They are part of the repeated run now and we
-    // don't want to flush them out as literals.
-    num_buffered_values_ = 0;
-    if (literal_count_ != 0) {
-      // There was a current literal run.  All the values in it have been flushed
-      // but we still need to update the indicator byte.
-      DCHECK_EQ(literal_count_ % 8, 0);
-      DCHECK_EQ(repeat_count_, 8);
-      FlushLiteralRun(true);
-    }
-    DCHECK_EQ(literal_count_, 0);
-    return;
-  }
-
-  literal_count_ += num_buffered_values_;
-  DCHECK_EQ(literal_count_ % 8, 0);
-  int num_groups = literal_count_ / 8;
-  if (num_groups + 1 >= (1 << 6)) {
-    // We need to start a new literal run because the indicator byte we've reserved
-    // cannot store more values.
-    DCHECK(literal_indicator_byte_ != NULL);
-    FlushLiteralRun(true);
-  } else {
-    FlushLiteralRun(done);
-  }
-  repeat_count_ = 0;
-}
-
-inline int RleEncoder::Flush() {
-  if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
-    bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
-                                                 num_buffered_values_ == 0);
-    // There is something pending, figure out if it's a repeated or literal run
-    if (repeat_count_ > 0 && all_repeat) {
-      FlushRepeatedRun();
-    } else {
-      DCHECK_EQ(literal_count_ % 8, 0);
-      // Buffer the last group of literals to 8 by padding with 0s.
-      for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
-           ++num_buffered_values_) {
-        buffered_values_[num_buffered_values_] = 0;
-      }
-      literal_count_ += num_buffered_values_;
-      FlushLiteralRun(true);
-      repeat_count_ = 0;
-    }
-  }
-  bit_writer_.Flush();
-  DCHECK_EQ(num_buffered_values_, 0);
-  DCHECK_EQ(literal_count_, 0);
-  DCHECK_EQ(repeat_count_, 0);
-
-  return bit_writer_.bytes_written();
-}
-
-inline void RleEncoder::CheckBufferFull() {
-  int bytes_written = bit_writer_.bytes_written();
-  if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
-    buffer_full_ = true;
-  }
-}
-
-inline void RleEncoder::Clear() {
-  buffer_full_ = false;
-  current_value_ = 0;
-  repeat_count_ = 0;
-  num_buffered_values_ = 0;
-  literal_count_ = 0;
-  literal_indicator_byte_ = NULL;
-  bit_writer_.Clear();
-}
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_RLE_ENCODING_H
diff --git a/src/parquet/util/rle-test.cc b/src/parquet/util/rle-test.cc
deleted file mode 100644
index 4aa96c3e..00000000
--- a/src/parquet/util/rle-test.cc
+++ /dev/null
@@ -1,460 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#include <gtest/gtest.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <boost/utility.hpp>
-
-#include <cstdint>
-#include <iostream>
-#include <random>
-#include <vector>
-
-#include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/rle-encoding.h"
-
-using std::vector;
-
-namespace parquet {
-
-const int MAX_WIDTH = 32;
-
-TEST(BitArray, TestBool) {
-  const int len = 8;
-  uint8_t buffer[len];
-
-  BitWriter writer(buffer, len);
-
-  // Write alternating 0's and 1's
-  for (int i = 0; i < 8; ++i) {
-    bool result = writer.PutValue(i % 2, 1);
-    EXPECT_TRUE(result);
-  }
-  writer.Flush();
-  EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
-
-  // Write 00110011
-  for (int i = 0; i < 8; ++i) {
-    bool result = false;
-    switch (i) {
-      case 0:
-      case 1:
-      case 4:
-      case 5:
-        result = writer.PutValue(false, 1);
-        break;
-      default:
-        result = writer.PutValue(true, 1);
-        break;
-    }
-    EXPECT_TRUE(result);
-  }
-  writer.Flush();
-
-  // Validate the exact bit value
-  EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
-  EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0));
-
-  // Use the reader and validate
-  BitReader reader(buffer, len);
-  for (int i = 0; i < 8; ++i) {
-    bool val = false;
-    bool result = reader.GetValue(1, &val);
-    EXPECT_TRUE(result);
-    EXPECT_EQ(val, (i % 2) != 0);
-  }
-
-  for (int i = 0; i < 8; ++i) {
-    bool val = false;
-    bool result = reader.GetValue(1, &val);
-    EXPECT_TRUE(result);
-    switch (i) {
-      case 0:
-      case 1:
-      case 4:
-      case 5:
-        EXPECT_EQ(val, false);
-        break;
-      default:
-        EXPECT_EQ(val, true);
-        break;
-    }
-  }
-}
-
-// Writes 'num_vals' values with width 'bit_width' and reads them back.
-void TestBitArrayValues(int bit_width, int num_vals) {
-  int len = static_cast<int>(BitUtil::Ceil(bit_width * num_vals, 8));
-  EXPECT_TRUE(len > 0);
-  const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width;
-
-  std::vector<uint8_t> buffer(len);
-  BitWriter writer(buffer.data(), len);
-  for (int i = 0; i < num_vals; ++i) {
-    bool result = writer.PutValue(i % mod, bit_width);
-    EXPECT_TRUE(result);
-  }
-  writer.Flush();
-  EXPECT_EQ(writer.bytes_written(), len);
-
-  BitReader reader(buffer.data(), len);
-  for (int i = 0; i < num_vals; ++i) {
-    int64_t val = 0;
-    bool result = reader.GetValue(bit_width, &val);
-    EXPECT_TRUE(result);
-    EXPECT_EQ(val, i % mod);
-  }
-  EXPECT_EQ(reader.bytes_left(), 0);
-}
-
-TEST(BitArray, TestValues) {
-  for (int width = 1; width <= MAX_WIDTH; ++width) {
-    TestBitArrayValues(width, 1);
-    TestBitArrayValues(width, 2);
-    // Don't write too many values
-    TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096);
-    TestBitArrayValues(width, 1024);
-  }
-}
-
-// Test some mixed values
-TEST(BitArray, TestMixed) {
-  const int len = 1024;
-  uint8_t buffer[len];
-  bool parity = true;
-
-  BitWriter writer(buffer, len);
-  for (int i = 0; i < len; ++i) {
-    bool result;
-    if (i % 2 == 0) {
-      result = writer.PutValue(parity, 1);
-      parity = !parity;
-    } else {
-      result = writer.PutValue(i, 10);
-    }
-    EXPECT_TRUE(result);
-  }
-  writer.Flush();
-
-  parity = true;
-  BitReader reader(buffer, len);
-  for (int i = 0; i < len; ++i) {
-    bool result;
-    if (i % 2 == 0) {
-      bool val;
-      result = reader.GetValue(1, &val);
-      EXPECT_EQ(val, parity);
-      parity = !parity;
-    } else {
-      int val;
-      result = reader.GetValue(10, &val);
-      EXPECT_EQ(val, i);
-    }
-    EXPECT_TRUE(result);
-  }
-}
-
-// Validates encoding of values by encoding and decoding them.  If
-// expected_encoding != NULL, also validates that the encoded buffer is
-// exactly 'expected_encoding'.
-// if expected_len is not -1, it will validate the encoded size is correct.
-void ValidateRle(const vector<int>& values, int bit_width, uint8_t* expected_encoding,
-    int expected_len) {
-  const int len = 64 * 1024;
-  uint8_t buffer[len];
-  EXPECT_LE(expected_len, len);
-
-  RleEncoder encoder(buffer, len, bit_width);
-  for (size_t i = 0; i < values.size(); ++i) {
-    bool result = encoder.Put(values[i]);
-    EXPECT_TRUE(result);
-  }
-  int encoded_len = encoder.Flush();
-
-  if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); }
-  if (expected_encoding != NULL) {
-    EXPECT_TRUE(memcmp(buffer, expected_encoding, expected_len) == 0);
-  }
-
-  // Verify read
-  {
-    RleDecoder decoder(buffer, len, bit_width);
-    for (size_t i = 0; i < values.size(); ++i) {
-      uint64_t val;
-      bool result = decoder.Get(&val);
-      EXPECT_TRUE(result);
-      EXPECT_EQ(values[i], val);
-    }
-  }
-
-  // Verify batch read
-  {
-    RleDecoder decoder(buffer, len, bit_width);
-    vector<int> values_read(values.size());
-    ASSERT_EQ(values.size(),
-        decoder.GetBatch(values_read.data(), static_cast<int>(values.size())));
-    EXPECT_EQ(values, values_read);
-  }
-}
-
-// A version of ValidateRle that round-trips the values and returns false if
-// the returned values are not all the same
-bool CheckRoundTrip(const vector<int>& values, int bit_width) {
-  const int len = 64 * 1024;
-  uint8_t buffer[len];
-  RleEncoder encoder(buffer, len, bit_width);
-  for (size_t i = 0; i < values.size(); ++i) {
-    bool result = encoder.Put(values[i]);
-    if (!result) { return false; }
-  }
-  int encoded_len = encoder.Flush();
-  int out = 0;
-
-  {
-    RleDecoder decoder(buffer, encoded_len, bit_width);
-    for (size_t i = 0; i < values.size(); ++i) {
-      EXPECT_TRUE(decoder.Get(&out));
-      if (values[i] != out) { return false; }
-    }
-  }
-
-  // Verify batch read
-  {
-    RleDecoder decoder(buffer, len, bit_width);
-    vector<int> values_read(values.size());
-    if (static_cast<int>(values.size()) !=
-        decoder.GetBatch(values_read.data(), static_cast<int>(values.size()))) {
-      return false;
-    }
-    if (values != values_read) { return false; }
-  }
-
-  return true;
-}
-
-TEST(Rle, SpecificSequences) {
-  const int len = 1024;
-  uint8_t expected_buffer[len];
-  vector<int> values;
-
-  // Test 50 0' followed by 50 1's
-  values.resize(100);
-  for (int i = 0; i < 50; ++i) {
-    values[i] = 0;
-  }
-  for (int i = 50; i < 100; ++i) {
-    values[i] = 1;
-  }
-
-  // expected_buffer valid for bit width <= 1 byte
-  expected_buffer[0] = (50 << 1);
-  expected_buffer[1] = 0;
-  expected_buffer[2] = (50 << 1);
-  expected_buffer[3] = 1;
-  for (int width = 1; width <= 8; ++width) {
-    ValidateRle(values, width, expected_buffer, 4);
-  }
-
-  for (int width = 9; width <= MAX_WIDTH; ++width) {
-    ValidateRle(values, width, NULL, 2 * (1 + static_cast<int>(BitUtil::Ceil(width, 8))));
-  }
-
-  // Test 100 0's and 1's alternating
-  for (int i = 0; i < 100; ++i) {
-    values[i] = i % 2;
-  }
-  int num_groups = static_cast<int>(BitUtil::Ceil(100, 8));
-  expected_buffer[0] = (num_groups << 1) | 1;
-  for (int i = 1; i <= 100 / 8; ++i) {
-    expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0);
-  }
-  // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0.
-  expected_buffer[100 / 8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0);
-
-  // num_groups and expected_buffer only valid for bit width = 1
-  ValidateRle(values, 1, expected_buffer, 1 + num_groups);
-  for (int width = 2; width <= MAX_WIDTH; ++width) {
-    int num_values = static_cast<int>(BitUtil::Ceil(100, 8)) * 8;
-    ValidateRle(
-        values, width, NULL, 1 + static_cast<int>(BitUtil::Ceil(width * num_values, 8)));
-  }
-}
-
-// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value
-// is used, otherwise alternating values are used.
-void TestRleValues(int bit_width, int num_vals, int value = -1) {
-  const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width;
-  vector<int> values;
-  for (int v = 0; v < num_vals; ++v) {
-    values.push_back((value != -1) ? value : (v % mod));
-  }
-  ValidateRle(values, bit_width, NULL, -1);
-}
-
-TEST(Rle, TestValues) {
-  for (int width = 1; width <= MAX_WIDTH; ++width) {
-    TestRleValues(width, 1);
-    TestRleValues(width, 1024);
-    TestRleValues(width, 1024, 0);
-    TestRleValues(width, 1024, 1);
-  }
-}
-
-TEST(Rle, BitWidthZeroRepeated) {
-  uint8_t buffer[1];
-  const int num_values = 15;
-  buffer[0] = num_values << 1;  // repeated indicator byte
-  RleDecoder decoder(buffer, sizeof(buffer), 0);
-  uint8_t val;
-  for (int i = 0; i < num_values; ++i) {
-    bool result = decoder.Get(&val);
-    EXPECT_TRUE(result);
-    EXPECT_EQ(val, 0);  // can only encode 0s with bit width 0
-  }
-  EXPECT_FALSE(decoder.Get(&val));
-}
-
-TEST(Rle, BitWidthZeroLiteral) {
-  uint8_t buffer[1];
-  const int num_groups = 4;
-  buffer[0] = num_groups << 1 | 1;  // literal indicator byte
-  RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
-  const int num_values = num_groups * 8;
-  uint8_t val;
-  for (int i = 0; i < num_values; ++i) {
-    bool result = decoder.Get(&val);
-    EXPECT_TRUE(result);
-    EXPECT_EQ(val, 0);  // can only encode 0s with bit width 0
-  }
-  EXPECT_FALSE(decoder.Get(&val));
-}
-
-// Test that writes out a repeated group and then a literal
-// group but flush before finishing.
-TEST(BitRle, Flush) {
-  vector<int> values;
-  for (int i = 0; i < 16; ++i)
-    values.push_back(1);
-  values.push_back(0);
-  ValidateRle(values, 1, NULL, -1);
-  values.push_back(1);
-  ValidateRle(values, 1, NULL, -1);
-  values.push_back(1);
-  ValidateRle(values, 1, NULL, -1);
-  values.push_back(1);
-  ValidateRle(values, 1, NULL, -1);
-}
-
-// Test some random sequences.
-TEST(BitRle, Random) {
-  int niters = 50;
-  int ngroups = 1000;
-  int max_group_size = 16;
-  vector<int> values(ngroups + max_group_size);
-
-  // prng setup
-  std::random_device rd;
-  std::uniform_int_distribution<int> dist(1, 20);
-
-  for (int iter = 0; iter < niters; ++iter) {
-    // generate a seed with device entropy
-    uint32_t seed = rd();
-    std::mt19937 gen(seed);
-
-    bool parity = 0;
-    values.resize(0);
-
-    for (int i = 0; i < ngroups; ++i) {
-      int group_size = dist(gen);
-      if (group_size > max_group_size) { group_size = 1; }
-      for (int i = 0; i < group_size; ++i) {
-        values.push_back(parity);
-      }
-      parity = !parity;
-    }
-    if (!CheckRoundTrip(values, BitUtil::NumRequiredBits(values.size()))) {
-      FAIL() << "failing seed: " << seed;
-    }
-  }
-}
-
-// Test a sequence of 1 0's, 2 1's, 3 0's. etc
-// e.g. 011000111100000
-TEST(BitRle, RepeatedPattern) {
-  vector<int> values;
-  const int min_run = 1;
-  const int max_run = 32;
-
-  for (int i = min_run; i <= max_run; ++i) {
-    int v = i % 2;
-    for (int j = 0; j < i; ++j) {
-      values.push_back(v);
-    }
-  }
-
-  // And go back down again
-  for (int i = max_run; i >= min_run; --i) {
-    int v = i % 2;
-    for (int j = 0; j < i; ++j) {
-      values.push_back(v);
-    }
-  }
-
-  ValidateRle(values, 1, NULL, -1);
-}
-
-TEST(BitRle, Overflow) {
-  for (int bit_width = 1; bit_width < 32; bit_width += 3) {
-    int len = RleEncoder::MinBufferSize(bit_width);
-    std::vector<uint8_t> buffer(len);
-    int num_added = 0;
-    bool parity = true;
-
-    RleEncoder encoder(buffer.data(), len, bit_width);
-    // Insert alternating true/false until there is no space left
-    while (true) {
-      bool result = encoder.Put(parity);
-      parity = !parity;
-      if (!result) break;
-      ++num_added;
-    }
-
-    int bytes_written = encoder.Flush();
-    EXPECT_LE(bytes_written, len);
-    EXPECT_GT(num_added, 0);
-
-    RleDecoder decoder(buffer.data(), bytes_written, bit_width);
-    parity = true;
-    uint32_t v;
-    for (int i = 0; i < num_added; ++i) {
-      bool result = decoder.Get(&v);
-      EXPECT_TRUE(result);
-      EXPECT_EQ(v != 0, parity);
-      parity = !parity;
-    }
-    // Make sure we get false when reading past end a couple times.
-    EXPECT_FALSE(decoder.Get(&v));
-    EXPECT_FALSE(decoder.Get(&v));
-  }
-}
-
-}  // namespace parquet
diff --git a/src/parquet/util/sse-util.h b/src/parquet/util/sse-util.h
deleted file mode 100644
index 653c1714..00000000
--- a/src/parquet/util/sse-util.h
+++ /dev/null
@@ -1,237 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
-// functions needed for parquet-cpp
-
-#ifndef PARQUET_UTIL_SSE_UTIL_H
-#define PARQUET_UTIL_SSE_UTIL_H
-
-#ifdef PARQUET_USE_SSE
-#include <emmintrin.h>
-#endif
-
-namespace parquet {
-
-/// This class contains constants useful for text processing with SSE4.2 intrinsics.
-namespace SSEUtil {
-/// Number of characters that fit in 64/128 bit register.  SSE provides instructions
-/// for loading 64 or 128 bits into a register at a time.
-static const int CHARS_PER_64_BIT_REGISTER = 8;
-static const int CHARS_PER_128_BIT_REGISTER = 16;
-
-/// SSE4.2 adds instructions for text processing.  The instructions have a control
-/// byte that determines some of functionality of the instruction.  (Equivalent to
-/// GCC's _SIDD_CMP_EQUAL_ANY, etc).
-static const int PCMPSTR_EQUAL_ANY = 0x00;     // strchr
-static const int PCMPSTR_EQUAL_EACH = 0x08;    // strcmp
-static const int PCMPSTR_UBYTE_OPS = 0x00;     // unsigned char (8-bits, rather than 16)
-static const int PCMPSTR_NEG_POLARITY = 0x10;  // see Intel SDM chapter 4.1.4.
-
-/// In this mode, SSE text processing functions will return a mask of all the
-/// characters that matched.
-static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS;
-
-/// In this mode, SSE text processing functions will return the number of
-/// bytes that match consecutively from the beginning.
-static const int STRCMP_MODE =
-    PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY;
-
-/// Precomputed mask values up to 16 bits.
-static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
-    1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9,
-    1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,
-};
-}  // namespace SSEUtil
-
-#ifdef PARQUET_USE_SSE
-
-/// Define the SSE 4.2 intrinsics.  The caller must first verify at runtime (or codegen
-/// IR load time) that the processor supports SSE 4.2 before calling these.  These are
-/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros.
-#ifndef IR_COMPILE
-/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler
-/// flag.  Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime
-/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2
-/// (IMPALA-1399/1646).  The compiler intrinsics cannot be used without -msse4.2, so we
-/// define our own implementations of the intrinsics instead.
-
-/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an
-/// immediate.  So, those need to be always inlined in order to always propagate the
-/// mode constant into the inline asm.
-#define SSE_ALWAYS_INLINE inline __attribute__((__always_inline__))
-
-template <int MODE>
-static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
-#ifdef __clang__
-  /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
-  /// clang doesn't support Y-prefixed asm constraints.
-  register volatile __m128i result asm("xmm0");
-  __asm__ volatile("pcmpestrm %5, %2, %1"
-                   : "=x"(result)
-                   : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
-                   : "cc");
-#else
-  __m128i result;
-  __asm__ volatile("pcmpestrm %5, %2, %1"
-                   : "=Yz"(result)
-                   : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
-                   : "cc");
-#endif
-  return result;
-}
-
-template <int MODE>
-static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
-  int result;
-  __asm__("pcmpestri %5, %2, %1"
-          : "=c"(result)
-          : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
-          : "cc");
-  return result;
-}
-
-static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
-  __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v));
-  return crc;
-}
-
-static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
-  __asm__("crc32w %1, %0" : "+r"(crc) : "rm"(v));
-  return crc;
-}
-
-static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
-  __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v));
-  return crc;
-}
-
-static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
-  uint64_t result = crc;
-  __asm__("crc32q %1, %0" : "+r"(result) : "rm"(v));
-  return result;
-}
-
-static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
-  int64_t result;
-  __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc");
-  return result;
-}
-
-#undef SSE_ALWAYS_INLINE
-
-#elif defined(__SSE4_2__)  // IR_COMPILE for SSE 4.2.
-/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not
-/// support it.  However, the cross-compiled IR is compiled twice: with and without
-/// -msse4.2.  When -msse4.2 is enabled in the cross-compile, we can just use the
-/// compiler intrinsics.
-
-#include <smmintrin.h>
-
-template <int MODE>
-static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
-  return _mm_cmpestrm(str1, len1, str2, len2, MODE);
-}
-
-template <int MODE>
-static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
-  return _mm_cmpestri(str1, len1, str2, len2, MODE);
-}
-
-#define SSE4_crc32_u8 _mm_crc32_u8
-#define SSE4_crc32_u16 _mm_crc32_u16
-#define SSE4_crc32_u32 _mm_crc32_u32
-#define SSE4_crc32_u64 _mm_crc32_u64
-#define POPCNT_popcnt_u64 _mm_popcnt_u64
-
-#else  // IR_COMPILE without SSE 4.2.
-/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use
-/// SSE 4.2 instructions.  Otherwise, the IR loading will fail on CPUs that don't
-/// support SSE 4.2.  However, because the caller isn't allowed to call these routines
-/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case.
-
-template <int MODE>
-static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return (__m128i){0};  // NOLINT
-}
-
-template <int MODE>
-static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-#endif  // IR_COMPILE
-
-#else
-
-static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
-  DCHECK(false) << "SSE support is not enabled";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
-  DCHECK(false) << "SSE support is not enabled";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
-  DCHECK(false) << "SSE support is not enabled";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
-  DCHECK(false) << "SSE support is not enabled";
-  return 0;
-}
-
-static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
-  DCHECK(false) << "SSE support is not enabled";
-  return 0;
-}
-
-#endif  // PARQUET_USE_SSE
-
-}  // namespace parquet
-
-#endif  //  PARQUET_UTIL_SSE_UTIL_H